[PATCH 5/5] esp6: fix memleak on error path in esp6_input
From: Zhen Lei This ought to be an omission in e6194923237 ("esp: Fix memleaks on error paths."). The memleak on error path in esp6_input is similar to esp_input of esp4. Fixes: e6194923237 ("esp: Fix memleaks on error paths.") Fixes: 3f29770723f ("ipsec: check return value of skb_to_sgvec always") Signed-off-by: Zhen Lei Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 97513f35bcc5..88a7579c23bd 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -669,8 +669,10 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); ret = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(ret < 0)) + if (unlikely(ret < 0)) { + kfree(tmp); goto out; + } skb->ip_summed = CHECKSUM_NONE; -- 2.14.1
[PATCH 2/5] xfrm_user: prevent leaking 2 bytes of kernel memory
From: Eric Dumazet struct xfrm_userpolicy_type has two holes, so we should not use C99 style initializer. KMSAN report: BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:140 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x1b14/0x2800 lib/iov_iter.c:571 CPU: 1 PID: 4520 Comm: syz-executor841 Not tainted 4.17.0+ #5 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:113 kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1117 kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1211 kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1253 copyout lib/iov_iter.c:140 [inline] _copy_to_iter+0x1b14/0x2800 lib/iov_iter.c:571 copy_to_iter include/linux/uio.h:106 [inline] skb_copy_datagram_iter+0x422/0xfa0 net/core/datagram.c:431 skb_copy_datagram_msg include/linux/skbuff.h:3268 [inline] netlink_recvmsg+0x6f1/0x1900 net/netlink/af_netlink.c:1959 sock_recvmsg_nosec net/socket.c:802 [inline] sock_recvmsg+0x1d6/0x230 net/socket.c:809 ___sys_recvmsg+0x3fe/0x810 net/socket.c:2279 __sys_recvmmsg+0x58e/0xe30 net/socket.c:2391 do_sys_recvmmsg+0x2a6/0x3e0 net/socket.c:2472 __do_sys_recvmmsg net/socket.c:2485 [inline] __se_sys_recvmmsg net/socket.c:2481 [inline] __x64_sys_recvmmsg+0x15d/0x1c0 net/socket.c:2481 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x446ce9 RSP: 002b:7fc307918db8 EFLAGS: 0293 ORIG_RAX: 012b RAX: ffda RBX: 006dbc24 RCX: 00446ce9 RDX: 000a RSI: 20005040 RDI: 0003 RBP: 006dbc20 R08: 20004e40 R09: R10: 4000 R11: 0293 R12: R13: 7ffc8d2df32f R14: 7fc3079199c0 R15: 0001 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline] kmsan_save_stack mm/kmsan/kmsan.c:294 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:685 kmsan_memcpy_origins+0x11d/0x170 mm/kmsan/kmsan.c:527 __msan_memcpy+0x109/0x160 mm/kmsan/kmsan_instr.c:413 __nla_put lib/nlattr.c:569 [inline] nla_put+0x276/0x340 lib/nlattr.c:627 copy_to_user_policy_type net/xfrm/xfrm_user.c:1678 [inline] dump_one_policy+0xbe1/0x1090 net/xfrm/xfrm_user.c:1708 xfrm_policy_walk+0x45a/0xd00 net/xfrm/xfrm_policy.c:1013 xfrm_dump_policy+0x1c0/0x2a0 net/xfrm/xfrm_user.c:1749 netlink_dump+0x9b5/0x1550 net/netlink/af_netlink.c:2226 __netlink_dump_start+0x1131/0x1270 net/netlink/af_netlink.c:2323 netlink_dump_start include/linux/netlink.h:214 [inline] xfrm_user_rcv_msg+0x8a3/0x9b0 net/xfrm/xfrm_user.c:2577 netlink_rcv_skb+0x37e/0x600 net/netlink/af_netlink.c:2448 xfrm_netlink_rcv+0xb2/0xf0 net/xfrm/xfrm_user.c:2598 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1680/0x1750 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:629 [inline] sock_sendmsg net/socket.c:639 [inline] ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117 __sys_sendmsg net/socket.c:2155 [inline] __do_sys_sendmsg net/socket.c:2164 [inline] __se_sys_sendmsg net/socket.c:2162 [inline] __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Local variable description: upt.i@dump_one_policy Variable was created at: dump_one_policy+0x78/0x1090 net/xfrm/xfrm_user.c:1689 xfrm_policy_walk+0x45a/0xd00 net/xfrm/xfrm_policy.c:1013 Byte 130 of 137 is uninitialized Memory access starts at 88019550407f Fixes: c0144beaeca42 ("[XFRM] netlink: Use nla_put()/NLA_PUT() variantes") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 080035f056d9..1e50b70ad668 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1671,9 +1671,11 @@ static inline unsigned int userpolicy_type_attrsize(void) #ifdef CONFIG_XFRM_SUB_POLICY static int copy_to_user_policy_type(u8 type, struct sk_buff *skb) { - struct xfrm_userpolicy_type upt = { - .type = type, - }; + struct xfrm_userpolicy_type upt; + + /* Sadly there are two holes in struct xfrm_userpolicy_type */ + memset(&upt, 0, sizeof(upt)); + upt.type = type; return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt); } -- 2.14.1
pull request (net): ipsec 2018-07-27
1) Fix PMTU handling of vti6. We update the PMTU on the xfrm dst_entry which is not cached anymore after the flowchache removal. So update the PMTU of the original dst_entry instead. From Eyal Birger. 2) Fix a leak of kernel memory to userspace. From Eric Dumazet. 3) Fix a possible dst_entry memleak in xfrm_lookup_route. From Tommi Rantala. 4) Fix a skb leak in case we can't call nlmsg_multicast from xfrm_nlmsg_multicast. From Florian Westphal. 5) Fix a leak of a temporary buffer in the error path of esp6_input. From Zhen Lei. Please pull or let me know if there are problems. Thanks! The following changes since commit 1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next (2018-06-06 18:39:49 -0700) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git master for you to fetch changes up to 7284fdf39a912322ce97de2d30def3c6068a418c: esp6: fix memleak on error path in esp6_input (2018-06-27 17:32:11 +0200) Eric Dumazet (1): xfrm_user: prevent leaking 2 bytes of kernel memory Eyal Birger (1): vti6: fix PMTU caching and reporting on xmit Florian Westphal (1): xfrm: free skb if nlsk pointer is NULL Tommi Rantala (1): xfrm: fix missing dst_release() after policy blocking lbcast and multicast Zhen Lei (1): esp6: fix memleak on error path in esp6_input net/ipv6/esp6.c| 4 +++- net/ipv6/ip6_vti.c | 11 ++- net/xfrm/xfrm_policy.c | 3 +++ net/xfrm/xfrm_user.c | 18 +++--- 4 files changed, 23 insertions(+), 13 deletions(-)
[PATCH 4/5] xfrm: free skb if nlsk pointer is NULL
From: Florian Westphal nlmsg_multicast() always frees the skb, so in case we cannot call it we must do that ourselves. Fixes: 21ee543edc0dea ("xfrm: fix race between netns cleanup and state expire notification") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1e50b70ad668..33878e6e0d0a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1025,10 +1025,12 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, { struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); - if (nlsk) - return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); - else - return -1; + if (!nlsk) { + kfree_skb(skb); + return -EPIPE; + } + + return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } static inline unsigned int xfrm_spdinfo_msgsize(void) -- 2.14.1
[PATCH 3/5] xfrm: fix missing dst_release() after policy blocking lbcast and multicast
From: Tommi Rantala Fix missing dst_release() when local broadcast or multicast traffic is xfrm policy blocked. For IPv4 this results to dst leak: ip_route_output_flow() allocates dst_entry via __ip_route_output_key() and passes it to xfrm_lookup_route(). xfrm_lookup returns ERR_PTR(-EPERM) that is propagated. The dst that was allocated is never released. IPv4 local broadcast testcase: ping -b 192.168.1.255 & sleep 1 ip xfrm policy add src 0.0.0.0/0 dst 192.168.1.255/32 dir out action block IPv4 multicast testcase: ping 224.0.0.1 & sleep 1 ip xfrm policy add src 0.0.0.0/0 dst 224.0.0.1/32 dir out action block For IPv6 the missing dst_release() causes trouble e.g. when used in netns: ip netns add TEST ip netns exec TEST ip link set lo up ip link add dummy0 type dummy ip link set dev dummy0 netns TEST ip netns exec TEST ip addr add fd00:: dev dummy0 ip netns exec TEST ip link set dummy0 up ip netns exec TEST ping -6 -c 5 ff02::1%dummy0 & sleep 1 ip netns exec TEST ip xfrm policy add src ::/0 dst ff02::1 dir out action block wait ip netns del TEST After netns deletion we see: [ 258.239097] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 268.279061] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 278.367018] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 288.375259] unregister_netdevice: waiting for lo to become free. Usage count = 2 Fixes: ac37e2515c1a ("xfrm: release dst_orig in case of error in xfrm_lookup()") Signed-off-by: Tommi Rantala Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5f48251c1319..7c5e8978aeaa 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2286,6 +2286,9 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); + if (IS_ERR(dst)) + dst_release(dst_orig); + return dst; } EXPORT_SYMBOL(xfrm_lookup_route); -- 2.14.1
[PATCH 1/5] vti6: fix PMTU caching and reporting on xmit
From: Eyal Birger When setting the skb->dst before doing the MTU check, the route PMTU caching and reporting is done on the new dst which is about to be released. Instead, PMTU handling should be done using the original dst. This is aligned with IPv4 VTI. Fixes: ccd740cbc6 ("vti6: Add pmtu handling to vti6_xmit.") Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index b7f28deddaea..c72ae3a4fe09 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -480,10 +480,6 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) goto tx_err_dst_release; } - skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); - skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; - mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { skb_dst_update_pmtu(skb, mtu); @@ -498,9 +494,14 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) htonl(mtu)); } - return -EMSGSIZE; + err = -EMSGSIZE; + goto tx_err_dst_release; } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); -- 2.14.1
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
Fri, Jul 27, 2018 at 06:18:14AM CEST, xiyou.wangc...@gmail.com wrote: >On Thu, Jul 26, 2018 at 9:33 AM Jiri Pirko wrote: >> >> From: Jiri Pirko >> >> In case a chain is empty and not explicitly created by a user, >> such chain should not exist. The only exception is if there is >> an action "goto chain" pointing to it. In that case, don't show the >> chain in the dump. Track the chain references held by actions and >> use them to find out if a chain should or should not be shown >> in chain dump. > >Hiding it makes sense. But you still need to make sure >user can't find it by ID, hiding it merely means user can't >see it. > >Also, I don't understand why you increase the refcnt >for a hiding chain either, like Jakub mentioned. > >If user can't see it, it must not be found by ID either. Ack. Will do that.
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
Thu, Jul 26, 2018 at 09:59:30PM CEST, jakub.kicin...@netronome.com wrote: >On Thu, 26 Jul 2018 18:31:01 +0200, Jiri Pirko wrote: >> From: Jiri Pirko >> >> In case a chain is empty and not explicitly created by a user, >> such chain should not exist. The only exception is if there is >> an action "goto chain" pointing to it. In that case, don't show the >> chain in the dump. Track the chain references held by actions and >> use them to find out if a chain should or should not be shown >> in chain dump. >> >> Signed-off-by: Jiri Pirko > >I don't have any better ideas :) > >One question below. > >> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c >> index 75cce2819de9..76035cd6e3bf 100644 >> --- a/net/sched/cls_api.c >> +++ b/net/sched/cls_api.c >> @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) >> ++chain->refcnt; >> } >> >> +static void tcf_chain_hold_by_act(struct tcf_chain *chain) >> +{ >> +++chain->action_refcnt; >> +} >> + >> +static void tcf_chain_release_by_act(struct tcf_chain *chain) >> +{ >> +--chain->action_refcnt; >> +} >> + >> +static bool tcf_chain_is_zombie(struct tcf_chain *chain) >> +{ >> +/* In case all the references are action references, this >> + * chain is a zombie and should not be listed in the chain >> + * dump list. >> + */ >> +return chain->refcnt == chain->action_refcnt; >> +} >> + >> static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, >>u32 chain_index) >> { >> @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block >> *block, u32 chain_index, >> } >> EXPORT_SYMBOL(tcf_chain_get); >> >> +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 >> chain_index) >> +{ >> +struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); >> + >> +tcf_chain_hold_by_act(chain); >> +return chain; >> +} >> +EXPORT_SYMBOL(tcf_chain_get_by_act); >> + >> static void tc_chain_tmplt_del(struct tcf_chain *chain); >> >> void tcf_chain_put(struct tcf_chain *chain) >> @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) >> } >> EXPORT_SYMBOL(tcf_chain_put); >> >> +void tcf_chain_put_by_act(struct tcf_chain *chain) >> +{ >> +tcf_chain_release_by_act(chain); >> +tcf_chain_put(chain); >> +} >> +EXPORT_SYMBOL(tcf_chain_put_by_act); >> + >> static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) >> { >> if (chain->explicitly_created) >> @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct >> nlmsghdr *n, >> chain = tcf_chain_lookup(block, chain_index); >> if (n->nlmsg_type == RTM_NEWCHAIN) { >> if (chain) { >> -NL_SET_ERR_MSG(extack, "Filter chain already exists"); >> -return -EEXIST; >> -} >> -if (!(n->nlmsg_flags & NLM_F_CREATE)) { >> -NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and >> NLM_F_CREATE to create a new chain"); >> -return -ENOENT; >> -} >> -chain = tcf_chain_create(block, chain_index); >> -if (!chain) { >> -NL_SET_ERR_MSG(extack, "Failed to create filter chain"); >> -return -ENOMEM; >> +if (tcf_chain_is_zombie(chain)) { >> +/* The chain exists only because there is >> + * some action referencing it, meaning it >> + * is a zombie. >> + */ >> +tcf_chain_hold(chain); > >I'm not 100% sure why this is needed? In my tree below I see: > > switch (n->nlmsg_type) { > case RTM_NEWCHAIN: > err = tc_chain_tmplt_add(chain, net, tca, extack); > if (err) > goto errout; > /* In case the chain was successfully added, take a reference >* to the chain. This ensures that an empty chain >* does not disappear at the end of this function. >*/ > tcf_chain_hold(chain); > chain->explicitly_created = true; > >so one reference will be taken.. do we need two? There is a put at the end of this function. > >> +} else { >> +NL_SET_ERR_MSG(extack, "Filter chain already >> exists"); >> +return -EEXIST; >> +} >> +} else { >> +if (!(n->nlmsg_flags & NLM_F_CREATE)) { >> +NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN >> and NLM_F_CREATE to create a new chain"); >> +return -ENOENT; >> +} >> +chain = tcf_chain_create(block, chain_index); >> +if (!chain) { >> +NL_SET_ERR_MSG(extack, "Failed to create fi
RE: [PATCH net-next] net/tls: Corrected enabling of zero-copy mode
> -Original Message- > From: netdev-ow...@vger.kernel.org [mailto:netdev- > ow...@vger.kernel.org] On Behalf Of David Miller > Sent: Thursday, July 26, 2018 1:59 AM > To: Vakul Garg > Cc: netdev@vger.kernel.org; bor...@mellanox.com; > avia...@mellanox.com; davejwat...@fb.com > Subject: Re: [PATCH net-next] net/tls: Corrected enabling of zero-copy mode > > From: Vakul Garg > Date: Mon, 23 Jul 2018 21:00:06 +0530 > > > @@ -787,7 +787,7 @@ int tls_sw_recvmsg(struct sock *sk, > > target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); > > timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); > > do { > > - bool zc = false; > > + bool zc; > > int chunk = 0; > > > > skb = tls_wait_data(sk, flags, timeo, &err); > ... > > @@ -836,6 +835,7 @@ int tls_sw_recvmsg(struct sock *sk, > > if (err < 0) > > goto fallback_to_reg_recv; > > > > + zc = true; > > err = decrypt_skb_update(sk, skb, sgin, &zc); > > for (; pages > 0; pages--) > > put_page(sg_page(&sgin[pages])); > @@ -845,6 +845,7 @@ int > > tls_sw_recvmsg(struct sock *sk, > > } > > } else { > > fallback_to_reg_recv: > > + zc = false; > > err = decrypt_skb_update(sk, skb, NULL, > &zc); > > if (err < 0) { > > tls_err_abort(sk, EBADMSG); > > -- > > 2.13.6 > > > > This will leave a code path where 'zc' is evaluated but not initialized to > any value. > > And that's the path taken when ctx->decrypted is true. The code after > your changes looks like: > > bool zc; > ... > if (!ctx->decrypted) { > > ... assignments to 'zc' happen in this code block > > ctx->decrypted = true; > } > > if (!zc) { > > So when ctx->decrypted it true, the if(!zc) condition runs on an > uninitialized value. > > I have to say that your TLS changes are becomming quite a time sink > for two reasons. > > First, you are making a lot of changes that seem not so needed, and > whose value is purely determined by taste. I'd put the > msg_data_left() multiple evaluation patch into this category. > > The rest require deep review and understanding of the complicated > details of the TLS code, and many of them turn out to be incorrect. My apologies for sending incorrect patches. I would be more careful next time. > > As I find more errors in your submissions, I begin to scrutinize your > patches even more. Thus, review of your changes takes even more time. > > And it isn't helping that there are not a lot of other developers > helping actively to review your changes. > > I would like to just make a small request to you, that you concentrate > on fixing clear bugs and clear issues that need to be resolved. > > Thank you.
Re: [PATCH] xfrm: fix ptr_ret.cocci warnings
On Thu, Jul 26, 2018 at 03:09:52PM +0800, kbuild test robot wrote: > From: kbuild test robot > > net/xfrm/xfrm_interface.c:692:1-3: WARNING: PTR_ERR_OR_ZERO can be used > > > Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR > > Generated by: scripts/coccinelle/api/ptr_ret.cocci > > Fixes: 44e2b838c24d ("xfrm: Return detailed errors from xfrmi_newlink") > CC: Benedict Wong > Signed-off-by: kbuild test robot Applied, thanks!
Re: [PATCH ipsec-next] xfrm: Return detailed errors from xfrmi_newlink
On Wed, Jul 25, 2018 at 01:45:29PM -0700, Benedict Wong wrote: > Currently all failure modes of xfrm interface creation return EEXIST. > This change improves the granularity of errnos provided by also > returning ENODEV or EINVAL if failures happen in looking up the > underlying interface, or a required parameter is not provided. > > This change has been tested against the Android Kernel Networking Tests, > with additional xfrmi_newlink tests here: > > https://android-review.googlesource.com/c/kernel/tests/+/715755 > > Signed-off-by: Benedict Wong Applied to ipsec-next, thanks Benedict!
Re: [PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
On 07/26/2018 11:32 PM, Jakub Kicinski wrote: > Hi! > > This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead > of bpf_load.o. First two patches are minor improvements to libbpf to make > the conversion (and use of libbpf in general) nicer. > > Jakub Kicinski (4): > tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() > tools: libbpf: add bpf_object__find_program_by_title() > samples: bpf: convert xdp_fwd_user.c to libbpf > samples: bpf: convert xdpsock_user.c to libbpf > > samples/bpf/Makefile | 4 ++-- > samples/bpf/xdp_fwd_user.c | 34 +++--- > samples/bpf/xdpsock_user.c | 38 +- > tools/lib/bpf/libbpf.c | 15 +++ > tools/lib/bpf/libbpf.h | 3 +++ > 5 files changed, 72 insertions(+), 22 deletions(-) > Applied to bpf-next, thanks Jakub!
Re: [PATCH bpf-next 0/6] nfp: bpf: improve efficiency of offloaded perf events
On 07/26/2018 04:53 AM, Jakub Kicinski wrote: > Hi! > > This set is focused on improving the performance of perf events > reported from BPF offload. Perf events can now be received on > packet data queues, which significantly improves the performance > (from total of 0.5 Msps to 5Msps per core). To get to this > performance we need a fast path for control messages which will > operate on raw buffers and recycle them immediately. > > Patch 5 replaces the map pointers for perf maps with map IDs. > We look the pointers up in a hashtable, anyway, to validate they > are correct, so there is no performance difference. Map IDs > have the advantage of being easier to understand for users in > case of errors (we no longer print raw pointers to the logs). > > Last patch improves info messages about map offload. > > Jakub Kicinski (6): > nfp: move repr handling on RX path > nfp: allow control message reception on data queues > nfp: bpf: pass raw data buffer to nfp_bpf_event_output() > nfp: bpf: allow receiving perf events on data queues > nfp: bpf: remember maps by ID > nfp: bpf: improve map offload info messages > > drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 25 +++- > drivers/net/ethernet/netronome/nfp/bpf/jit.c | 12 ++-- > drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 +- > drivers/net/ethernet/netronome/nfp/bpf/main.h | 9 ++- > .../net/ethernet/netronome/nfp/bpf/offload.c | 63 +++ > drivers/net/ethernet/netronome/nfp/nfp_app.c | 2 + > drivers/net/ethernet/netronome/nfp/nfp_app.h | 17 + > .../ethernet/netronome/nfp/nfp_net_common.c | 40 +++- > .../net/ethernet/netronome/nfp/nfp_net_ctrl.h | 1 + > 9 files changed, 125 insertions(+), 49 deletions(-) > Applied to bpf-next, thanks Jakub!
Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP
Hi John, On 2018/07/27 12:02, John Fastabend wrote: > On 07/26/2018 07:40 AM, Toshiaki Makita wrote: >> From: Toshiaki Makita >> >> This is the basic implementation of veth driver XDP. >> >> Incoming packets are sent from the peer veth device in the form of skb, >> so this is generally doing the same thing as generic XDP. >> >> This itself is not so useful, but a starting point to implement other >> useful veth XDP features like TX and REDIRECT. >> >> This introduces NAPI when XDP is enabled, because XDP is now heavily >> relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function >> enqueues packets to the ring and peer NAPI handler drains the ring. >> >> Currently only one ring is allocated for each veth device, so it does >> not scale on multiqueue env. This can be resolved by allocating rings >> on the per-queue basis later. >> >> Note that NAPI is not used but netif_rx is used when XDP is not loaded, >> so this does not change the default behaviour. >> >> v3: >> - Fix race on closing the device. >> - Add extack messages in ndo_bpf. >> >> v2: >> - Squashed with the patch adding NAPI. >> - Implement adjust_tail. >> - Don't acquire consumer lock because it is guarded by NAPI. >> - Make poll_controller noop since it is unnecessary. >> - Register rxq_info on enabling XDP rather than on opening the device. >> >> Signed-off-by: Toshiaki Makita >> --- > > > [...] > > One nit and one question. > >> + >> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, >> +struct sk_buff *skb) >> +{ >> +u32 pktlen, headroom, act, metalen; >> +void *orig_data, *orig_data_end; >> +int size, mac_len, delta, off; >> +struct bpf_prog *xdp_prog; >> +struct xdp_buff xdp; >> + >> +rcu_read_lock(); >> +xdp_prog = rcu_dereference(priv->xdp_prog); >> +if (unlikely(!xdp_prog)) { >> +rcu_read_unlock(); >> +goto out; >> +} >> + >> +mac_len = skb->data - skb_mac_header(skb); >> +pktlen = skb->len + mac_len; >> +size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + >> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); >> +if (size > PAGE_SIZE) >> +goto drop; > > I'm not sure why it matters if size > PAGE_SIZE here. Why not > just consume it and use the correct page order in alloc_page if > its not linear. Indeed. We can allow such skbs here at least if we don't need reallocation (which is highly unlikely though). But I'm not sure we should allocate multiple pages in atomic context. It tends to cause random allocation failure which is IMO more frustrating. We are now prohibiting such a situation by max_mtu and dropping features, which looks more robust to me. >> + >> +headroom = skb_headroom(skb) - mac_len; >> +if (skb_shared(skb) || skb_head_is_locked(skb) || >> +skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { >> +struct sk_buff *nskb; >> +void *head, *start; >> +struct page *page; >> +int head_off; >> + >> +page = alloc_page(GFP_ATOMIC); > > Should also have __NO_WARN here as well this can be triggered by > external events so we don't want DDOS here to flood system logs. Sure, thanks! -- Toshiaki Makita
Re: [pull request][net-next 00/13] Mellanox, mlx5e updates 2018-07-26 (XDP redirect)
From: Saeed Mahameed Date: Thu, 26 Jul 2018 15:56:34 -0700 > This series from Tariq adds the support for device-out XDP redirect. > > For more information please see tag log below. > > Please pull and let me know if there's any problem. Looks good, pulled, thanks a lot.
Re: [PATCH net] net: ena: Fix use of uninitialized DMA address bits field
From: Gal Pressman Date: Thu, 26 Jul 2018 23:40:33 +0300 > UBSAN triggers the following undefined behaviour warnings: > [...] > [ 13.236124] UBSAN: Undefined behaviour in > drivers/net/ethernet/amazon/ena/ena_eth_com.c:468:22 > [ 13.240043] shift exponent 64 is too large for 64-bit type 'long long > unsigned int' > [...] > [ 13.744769] UBSAN: Undefined behaviour in > drivers/net/ethernet/amazon/ena/ena_eth_com.c:373:4 > [ 13.748694] shift exponent 64 is too large for 64-bit type 'long long > unsigned int' > [...] > > When splitting the address to high and low, GENMASK_ULL is used to generate > a bitmask with dma_addr_bits field from io_sq (in ena_com_prepare_tx and > ena_com_add_single_rx_desc). > The problem is that dma_addr_bits is not initialized with a proper value > (besides being cleared in ena_com_create_io_queue). > Assign dma_addr_bits the correct value that is stored in ena_dev when > initializing the SQ. > > Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network > Adapters (ENA)") > Signed-off-by: Gal Pressman Applied and queued up for -stable.
Re: [PATCH net-next] netdevsim: make debug dirs' dentries static
From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:25:26 -0700 > The root directories of netdevsim should only be used by the core > to create per-device subdirectories, so limit their visibility to > the core file. > > Signed-off-by: Jakub Kicinski > Reviewed-by: Quentin Monnet Applied.
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
On Thu, Jul 26, 2018 at 9:33 AM Jiri Pirko wrote: > > From: Jiri Pirko > > In case a chain is empty and not explicitly created by a user, > such chain should not exist. The only exception is if there is > an action "goto chain" pointing to it. In that case, don't show the > chain in the dump. Track the chain references held by actions and > use them to find out if a chain should or should not be shown > in chain dump. Hiding it makes sense. But you still need to make sure user can't find it by ID, hiding it merely means user can't see it. Also, I don't understand why you increase the refcnt for a hiding chain either, like Jakub mentioned. If user can't see it, it must not be found by ID either.
Re: [PATCH v5 bpf-next 5/9] veth: Add ndo_xdp_xmit
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This allows NIC's XDP to redirect packets to veth. The destination veth > device enqueues redirected packets to the napi ring of its peer, then > they are processed by XDP on its peer veth device. > This can be thought as calling another XDP program by XDP program using > REDIRECT, when the peer enables driver XDP. > > Note that when the peer veth device does not set driver xdp, redirected > packets will be dropped because the peer is not ready for NAPI. > > v4: > - Don't use xdp_ok_fwd_dev() because checking IFF_UP is not necessary. > Add comments about it and check only MTU. > > v2: > - Drop the part converting xdp_frame into skb when XDP is not enabled. > - Implement bulk interface of ndo_xdp_xmit. > - Implement XDP_XMIT_FLUSH bit and drop ndo_xdp_flush. > > Signed-off-by: Toshiaki Makita > --- > drivers/net/veth.c | 51 +++ > 1 file changed, 51 insertions(+) > Acked-by: John Fastabend
Re: [PATCH v5 bpf-next 4/9] veth: Handle xdp_frames in xdp napi ring
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This is preparation for XDP TX and ndo_xdp_xmit. > This allows napi handler to handle xdp_frames through xdp ring as well > as sk_buff. > > v3: > - Revert v2 change around rings and use a flag to differentiate skb and > xdp_frame, since bulk skb xmit makes little performance difference > for now. > > v2: > - Use another ring instead of using flag to differentiate skb and > xdp_frame. This approach makes bulk skb transmit possible in > veth_xmit later. > - Clear xdp_frame feilds in skb->head. > - Implement adjust_tail. > > Signed-off-by: Toshiaki Makita > --- > drivers/net/veth.c | 87 > ++ > 1 file changed, 82 insertions(+), 5 deletions(-) > Acked-by: John Fastabend
Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This is the basic implementation of veth driver XDP. > > Incoming packets are sent from the peer veth device in the form of skb, > so this is generally doing the same thing as generic XDP. > > This itself is not so useful, but a starting point to implement other > useful veth XDP features like TX and REDIRECT. > > This introduces NAPI when XDP is enabled, because XDP is now heavily > relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function > enqueues packets to the ring and peer NAPI handler drains the ring. > > Currently only one ring is allocated for each veth device, so it does > not scale on multiqueue env. This can be resolved by allocating rings > on the per-queue basis later. > > Note that NAPI is not used but netif_rx is used when XDP is not loaded, > so this does not change the default behaviour. > > v3: > - Fix race on closing the device. > - Add extack messages in ndo_bpf. > > v2: > - Squashed with the patch adding NAPI. > - Implement adjust_tail. > - Don't acquire consumer lock because it is guarded by NAPI. > - Make poll_controller noop since it is unnecessary. > - Register rxq_info on enabling XDP rather than on opening the device. > > Signed-off-by: Toshiaki Makita > --- [...] One nit and one question. > + > +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, > + struct sk_buff *skb) > +{ > + u32 pktlen, headroom, act, metalen; > + void *orig_data, *orig_data_end; > + int size, mac_len, delta, off; > + struct bpf_prog *xdp_prog; > + struct xdp_buff xdp; > + > + rcu_read_lock(); > + xdp_prog = rcu_dereference(priv->xdp_prog); > + if (unlikely(!xdp_prog)) { > + rcu_read_unlock(); > + goto out; > + } > + > + mac_len = skb->data - skb_mac_header(skb); > + pktlen = skb->len + mac_len; > + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + > +SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + if (size > PAGE_SIZE) > + goto drop; I'm not sure why it matters if size > PAGE_SIZE here. Why not just consume it and use the correct page order in alloc_page if its not linear. > + > + headroom = skb_headroom(skb) - mac_len; > + if (skb_shared(skb) || skb_head_is_locked(skb) || > + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { > + struct sk_buff *nskb; > + void *head, *start; > + struct page *page; > + int head_off; > + > + page = alloc_page(GFP_ATOMIC); Should also have __NO_WARN here as well this can be triggered by external events so we don't want DDOS here to flood system logs. > + if (!page) > + goto drop; > + > + head = page_address(page); > + start = head + VETH_XDP_HEADROOM; > + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { > + page_frag_free(head); > + goto drop; > + } > + > + nskb = veth_build_skb(head, > + VETH_XDP_HEADROOM + mac_len, skb->len, > + PAGE_SIZE); > + if (!nskb) { > + page_frag_free(head); > + goto drop; > + } > + > + skb_copy_header(nskb, skb); > + head_off = skb_headroom(nskb) - skb_headroom(skb); > + skb_headers_offset_update(nskb, head_off); > + if (skb->sk) > + skb_set_owner_w(nskb, skb->sk); > + consume_skb(skb); > + skb = nskb; > + } > + > + xdp.data_hard_start = skb->head; > + xdp.data = skb_mac_header(skb); > + xdp.data_end = xdp.data + pktlen; > + xdp.data_meta = xdp.data; > + xdp.rxq = &priv->xdp_rxq; > + orig_data = xdp.data; > + orig_data_end = xdp.data_end; > + > + act = bpf_prog_run_xdp(xdp_prog, &xdp); > + > + switch (act) { > + case XDP_PASS: > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + case XDP_ABORTED: > + trace_xdp_exception(priv->dev, xdp_prog, act); > + case XDP_DROP: > + goto drop; > + } > + rcu_read_unlock(); > + > + delta = orig_data - xdp.data; > + off = mac_len + delta; > + if (off > 0) > + __skb_push(skb, off); > + else if (off < 0) > + __skb_pull(skb, -off); > + skb->mac_header -= delta; > + off = xdp.data_end - orig_data_end; > + if (off != 0) > + __skb_put(skb, off); > + skb->protocol = eth_type_trans(skb, priv->dev); > + > + metalen = xdp.data - xdp.data_meta; > + if (metalen) > + skb_metadata_set(skb, metalen); > +out: > + return skb; > +drop: > + rcu_read_unlock(); > + kfree_skb(skb); > + return NULL; > +} > + Thanks, John
Re: [PATCH net-next v3 1/5] tc/act: user space can't use TC_ACT_REDIRECT directly
On 07/26/2018 09:43 AM, Jiri Pirko wrote: > Wed, Jul 25, 2018 at 06:29:54PM CEST, dan...@iogearbox.net wrote: >> On 07/25/2018 05:48 PM, Paolo Abeni wrote: >>> On Wed, 2018-07-25 at 15:03 +0200, Jiri Pirko wrote: Wed, Jul 25, 2018 at 02:54:04PM CEST, pab...@redhat.com wrote: > On Wed, 2018-07-25 at 13:56 +0200, Jiri Pirko wrote: >> Tue, Jul 24, 2018 at 10:06:39PM CEST, pab...@redhat.com wrote: >>> Only cls_bpf and act_bpf can safely use such value. If a generic >>> action is configured by user space to return TC_ACT_REDIRECT, >>> the usually visible behavior is passing the skb up the stack - as >>> for unknown action, but, with complex configuration, more random >>> results can be obtained. >>> >>> This patch forcefully converts TC_ACT_REDIRECT to TC_ACT_UNSPEC >>> at action init time, making the kernel behavior more consistent. >>> >>> v1 -> v3: use TC_ACT_UNSPEC instead of a newly definied act value >>> >>> Signed-off-by: Paolo Abeni >>> --- >>> net/sched/act_api.c | 5 + >>> 1 file changed, 5 insertions(+) >>> >>> diff --git a/net/sched/act_api.c b/net/sched/act_api.c >>> index 148a89ab789b..24b5534967fe 100644 >>> --- a/net/sched/act_api.c >>> +++ b/net/sched/act_api.c >>> @@ -895,6 +895,11 @@ struct tc_action *tcf_action_init_1(struct net >>> *net, struct tcf_proto *tp, >>> } >>> } >>> >>> + if (a->tcfa_action == TC_ACT_REDIRECT) { >>> + net_warn_ratelimited("TC_ACT_REDIRECT can't be used >>> directly"); >> >> Can't you push this warning through extack? >> >> But, wouldn't it be more appropriate to fail here? User is passing >> invalid configuration > > Jiri, Jamal, thank you for the feedback. > > Please allow me to answer both of you here, since you raised similar > concers. > > I thought about rejecting the action, but that change of behavior could > break some users, as currently most kind of invalid tcfa_action values > are simply accepted. > > If there is consensus about it, I can simply fail. Well it was obviously wrong to expose TC_ACT_REDIRECT to uapi and it really has no meaning for anyone to use it throughout its whole history. >> >> That claim is completely wrong. > > Why? Does addition of TC_ACT_REDIRECT to uapi have any meaning? BPF programs return TC_ACT_* as a verdict which is then further processed, including TC_ACT_REDIRECT. Hence, it's a contract for them similarly as e.g. helper enums (BPF_FUNC_*) and other things they make use of. Cheers, Daniel
Re: [PATCH bpf] bpf: btf: Use exact btf value_size match in map_check_btf()
On 07/26/2018 06:57 PM, Martin KaFai Lau wrote: > The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects > '> map->value_size' to ensure map_seq_show_elem() will not > access things beyond an array element. > > Yonghong suggested that using '!=' is a more correct > check. The 8 bytes round_up on value_size is stored > in array->elem_size. Hence, using '!=' on map->value_size > is a proper check. > > This patch also adds new tests to check the btf array > key type and value type. Two of these new tests verify > the btf's value_size (the change in this patch). > > It also fixes two existing tests that wrongly encoded > a btf's type size (pprint_test) and the value_type_id (in one > of the raw_tests[]). However, that do not affect these two > BTF verification tests before or after this test changes. > These two tests mainly failed at array creation time after > this patch. > > Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic > arraymap") > Suggested-by: Yonghong Song > Acked-by: Yonghong Song > Signed-off-by: Martin KaFai Lau Applied to bpf, thanks Martin!
Re: [PATCH V3 bpf] xdp: add NULL pointer check in __xdp_return()
On 07/26/2018 04:17 PM, Taehee Yoo wrote: > rhashtable_lookup() can return NULL. so that NULL pointer > check routine should be added. > > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > Acked-by: Martin KaFai Lau > Signed-off-by: Taehee Yoo Applied to bpf, thanks Taehee!
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On Fri, 27 Jul 2018 10:06:41 +0900, Toshiaki Makita wrote: > On 2018/07/27 9:51, Jakub Kicinski wrote: > > On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: > >> + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - > >> +peer->hard_header_len - > >> +SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > >> + if (peer->mtu > max_mtu) { > >> + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to > >> set XDP"); > >> + err = -ERANGE; > >> + goto err; > >> + } > > > > You need to add .ndo_change_mtu and check this condition there too. > > I'm setting peer->max_mtu so no need to add .ndo_change_mtu. > Inappropriate MTU will be refused in dev_set_mtu(). missed that, sorry
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On 2018/07/27 9:51, Jakub Kicinski wrote: > On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: >> +max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - >> + peer->hard_header_len - >> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); >> +if (peer->mtu > max_mtu) { >> +NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to >> set XDP"); >> +err = -ERANGE; >> +goto err; >> +} > > You need to add .ndo_change_mtu and check this condition there too. I'm setting peer->max_mtu so no need to add .ndo_change_mtu. Inappropriate MTU will be refused in dev_set_mtu(). -- Toshiaki Makita
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: > + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - > + peer->hard_header_len - > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + if (peer->mtu > max_mtu) { > + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to > set XDP"); > + err = -ERANGE; > + goto err; > + } You need to add .ndo_change_mtu and check this condition there too.
Re: [PATCH net-next v4 1/4] net/sched: user-space can't set unknown tcfa_action values
Hi, On Thu, Jul 26, 2018 at 04:34:57PM +0200, Paolo Abeni wrote: ... > @@ -895,6 +904,14 @@ struct tc_action *tcf_action_init_1(struct net *net, > struct tcf_proto *tp, > } > } > > + if (!tcf_action_valid(a->tcfa_action)) { > + net_warn_ratelimited("invalid %d action value, using " > + "TC_ACT_UNSPEC instead", a->tcfa_action); Now that it is reporting the error via extack, do we really need this warn net_warn? extack will be shown as a warning by iproute2 even if the command succeeds. > + NL_SET_ERR_MSG(extack, "invalid action value, using " > +"TC_ACT_UNSPEC instead"); Quoted strings shouldn't be broken down into multiple lines.. > + a->tcfa_action = TC_ACT_UNSPEC; > + } > + > return a; > > err_mod: > -- > 2.17.1 >
Re: [PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
On Thu, Jul 26, 2018 at 2:32 PM, Jakub Kicinski wrote: > Hi! > > This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead > of bpf_load.o. First two patches are minor improvements to libbpf to make > the conversion (and use of libbpf in general) nicer. > > Jakub Kicinski (4): > tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() > tools: libbpf: add bpf_object__find_program_by_title() > samples: bpf: convert xdp_fwd_user.c to libbpf > samples: bpf: convert xdpsock_user.c to libbpf LGTM. Ack for the whole series. Acked-by: Yonghong Song > > samples/bpf/Makefile | 4 ++-- > samples/bpf/xdp_fwd_user.c | 34 +++--- > samples/bpf/xdpsock_user.c | 38 +- > tools/lib/bpf/libbpf.c | 15 +++ > tools/lib/bpf/libbpf.h | 3 +++ > 5 files changed, 72 insertions(+), 22 deletions(-) > > -- > 2.17.1 >
Re: [PATCH net-next v4 3/4] net/tc: introduce TC_ACT_REINSERT.
On Thu, Jul 26, 2018 at 7:35 AM Paolo Abeni wrote: > > This is similar TC_ACT_REDIRECT, but with a slightly different > semantic: > - on ingress the mirred skbs are passed to the target device > network stack without any additional check not scrubbing. > - the rcu-protected stats provided via the tcf_result struct > are updated on error conditions. > > This new tcfa_action value is not exposed to the user-space > and can be used only internally by clsact. > > v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce > a new action type instead > > v2 -> v3: > - rename the new action value TC_ACT_REINJECT, update the >helper accordingly > - take care of uncloned reinjected packets in XDP generic >hook > > v3 -> v4: > - renamed again the new action value (JiriP) > > Signed-off-by: Paolo Abeni Acked-by: Cong Wang > --- > Note: this patch still touch only overlimits, even there is some > agreement to touch (also) drops on reinsert/mirred failure, but > such change is independent to this series Totally agree. Thanks!
Re: [PATCH net-next v4 4/4] act_mirred: use TC_ACT_REINSERT when possible
On Thu, Jul 26, 2018 at 7:35 AM Paolo Abeni wrote: > > When mirred is invoked from the ingress path, and it wants to redirect > the processed packet, it can now use the TC_ACT_REINSERT action, > filling the tcf_result accordingly, and avoiding a per packet > skb_clone(). > > Overall this gives a ~10% improvement in forwarding performance for the > TC S/W data path and TC S/W performances are now comparable to the > kernel openvswitch datapath. > > v1 -> v2: use ACT_MIRRED instead of ACT_REDIRECT > v2 -> v3: updated after action rename, fixed typo into the commit > message > v3 -> v4: updated again after action rename, added more comments to > the code (JiriP), skip the optimization if the control action > need to touch the tcf_result (Paolo) > > Signed-off-by: Paolo Abeni Overall it looks good to me now. Acked-by: Cong Wang Thanks!
Re: [PATCH net-next v3 4/5] net/tc: introduce TC_ACT_REINJECT.
On Thu, Jul 26, 2018 at 5:52 AM Jamal Hadi Salim wrote: > > On 25/07/18 01:09 PM, Marcelo Ricardo Leitner wrote: > > On Wed, Jul 25, 2018 at 09:48:16AM -0700, Cong Wang wrote: > >> On Wed, Jul 25, 2018 at 5:27 AM Jamal Hadi Salim wrote: > >>> > >>> Those changes were there from the beginning (above patch did > >>> not introduce them). > >>> IIRC, the reason was to distinguish between policy intended > >>> drops and drops because of errors. > >> > >> There must be a limit for "overlimit" to make sense. There is > >> no limit in mirred action's context, probably there is only > >> such a limit in act_police. So, all rest should not touch overlimit. > > > > +1 > > > > I agree we should at least record drop count(unrelated patch though). > we should keep overlimit (for no other reason other than this > has been around for at least 15 years). > > On why "overlimit"? It is just a name for a counter that is useless > for most actions (but was still being transfered to user space). > It is the closest counter to counting "this failed because of > runtime errors" as opposed to "user asked us to drop this". > > Probably a good alternative is to make a very small stats v3 structure > (we have migrated stats structures before) and extend for > each action/classifier/qdisc to add its extra counters using XSTATS. Agreed. > > Note: > If you are _mirroring_ packets - incrementing the drop counter is > misleading because the packet is not dropped by the system. > i.e the qdisc will not record it as dropped; it should for > redirect policy. It is useful to be able to tell > them apart when you are collecting analytics just for actions. Sounds like we just need another counter rather than re-using overlimit or drops. > (if youve worked on a massive amount of machines you'll appreciate > being able to debug by looking at counters that reduce ambiguity). > Yes, this is how I found out the overlimit of htb qdisc is inaccurate or misleading, instead the one in htb class is accurate, see: commit 3c75f6ee139d464351f8ab77a042dd3635769d98 Author: Eric Dumazet Date: Mon Sep 18 12:36:22 2017 -0700 net_sched: sch_htb: add per class overlimits counter
Re: [PATCH] bpf: verifier: BPF_MOV don't mark dst reg if src == dst
On Thu, Jul 26, 2018 at 9:52 AM, Arthur Fabre wrote: > Oops, gmail seems to have mangled everything. Will resend using git > send-email. > > I've added the test cases for mov64, but I'm not sure of the expected mov32 > behavior. The interpreter has below: ALU_MOV_X: DST = (u32) SRC; CONT; ... ALU64_MOV_X: DST = SRC; CONT; The later verifier code does seem to mark dst_reg properly for both ALU and ALU64. > Currently coerce_reg_to_size() is called after mark_reg_unknown(), > which sets the bounds to 64bits. coerce_reg_to_size() resets the bounds > again, > as they're too "wide" to fit the new size. It sets SMIN = UMIN = 0, > which seems weird. Shouldn't SMIN be 1 << (size * 8 - 1)? Same applies for > SMAX. The SMIN/UMIN still should be 0 since there is no negative here due to smaller width? > Should mov32 always mark the dst as unbounded? We can do better than unbounded for dst register of mov32, which is the code already doing? > > > On Thu, Jul 26, 2018 at 1:42 AM, Daniel Borkmann > wrote: >> >> On 07/26/2018 12:08 AM, Arthur Fabre wrote: >> > When check_alu_op() handles a BPF_MOV between two registers, >> > it calls check_reg_arg() on the dst register, marking it as unbounded. >> > If the src and dst register are the same, this marks the src as >> > unbounded, which can lead to unexpected errors for further checks that >> > rely on bounds info. Could you explain (and add to the commit messages eventually) what are these unexpected errors? >> > >> > check_alu_op() now only marks the dst register as unbounded if it >> > different from the src register. >> > >> > Signed-off-by: Arthur Fabre >> > --- >> > kernel/bpf/verifier.c | 5 +++-- >> > 1 file changed, 3 insertions(+), 2 deletions(-) >> > >> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c >> > index 63aaac52a265..ddfe3c544a80 100644 >> > --- a/kernel/bpf/verifier.c >> > +++ b/kernel/bpf/verifier.c >> > @@ -3238,8 +3238,9 @@ static int check_alu_op(struct bpf_verifier_env >> > *env, struct bpf_insn *insn) >> > } >> > } >> > >> > - /* check dest operand */ >> > - err = check_reg_arg(env, insn->dst_reg, DST_OP); >> > + /* check dest operand, only mark if dest != src */ >> > + err = check_reg_arg(env, insn->dst_reg, >> > + insn->dst_reg == insn->src_reg ? >> > DST_OP_NO_MARK : DST_OP); >> > if (err) >> > return err; >> > >> >> Thanks a lot for the patch! Looks like it's corrupted wrt newline. >> >> Please also add test cases to tools/testing/selftests/bpf/test_verifier.c >> for the cases of mov64 and mov32 where in each src==dst and src!=dst; >> mov32 >> should mark it as unbounded but not former, so would be good to keep >> tracking >> that in selftests. > >
[net-next 03/13] net/mlx5e: Gather all XDP pre-requisite checks in a single function
From: Tariq Toukan Dedicate a function to all checks done when setting an XDP program. Take indications from priv instead of netdev features. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 33 --- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c214b23e14ba..a1a54ea3aeb4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4094,26 +4094,37 @@ static void mlx5e_tx_timeout(struct net_device *dev) queue_work(priv->wq, &priv->tx_timeout_work); } +static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + + if (priv->channels.params.lro_en) { + netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); + return -EINVAL; + } + + if (MLX5_IPSEC_DEV(priv->mdev)) { + netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + return -EINVAL; + } + + return 0; +} + static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) { struct mlx5e_priv *priv = netdev_priv(netdev); struct bpf_prog *old_prog; - int err = 0; bool reset, was_opened; + int err; int i; mutex_lock(&priv->state_lock); - if ((netdev->features & NETIF_F_LRO) && prog) { - netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); - err = -EINVAL; - goto unlock; - } - - if ((netdev->features & NETIF_F_HW_ESP) && prog) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); - err = -EINVAL; - goto unlock; + if (prog) { + err = mlx5e_xdp_allowed(priv); + if (err) + goto unlock; } was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); -- 2.17.0
[net-next 06/13] net/mlx5e: Add counter for XDP redirect in RX
From: Tariq Toukan Add per-ring and total stats for received packets that goes into XDP redirection. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 649675c1af61..34accf3f4cee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -69,6 +69,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, rq->xdpsq.db.redirect_flush = true; mlx5e_page_dma_unmap(rq, di); } + rq->stats->xdp_redirect++; return true; default: bpf_warn_invalid_xdp_action(act); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index c0507fada0be..b88db9d923ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -59,6 +59,7 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, @@ -142,6 +143,7 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv) s->rx_csum_unnecessary += rq_stats->csum_unnecessary; s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; s->rx_xdp_drop += rq_stats->xdp_drop; + s->rx_xdp_redirect += rq_stats->xdp_redirect; s->rx_xdp_tx += rq_stats->xdp_tx; s->rx_xdp_tx_cqe += rq_stats->xdp_tx_cqe; s->rx_xdp_tx_full += rq_stats->xdp_tx_full; @@ -1126,6 +1128,7 @@ static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_redirect) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx_cqe) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx_full) }, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index fc3f66003edd..07529cc58471 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -70,6 +70,7 @@ struct mlx5e_sw_stats { u64 rx_csum_complete; u64 rx_csum_unnecessary_inner; u64 rx_xdp_drop; + u64 rx_xdp_redirect; u64 rx_xdp_tx; u64 rx_xdp_tx_cqe; u64 rx_xdp_tx_full; @@ -178,6 +179,7 @@ struct mlx5e_rq_stats { u64 lro_bytes; u64 removed_vlan_packets; u64 xdp_drop; + u64 xdp_redirect; u64 xdp_tx; u64 xdp_tx_cqe; u64 xdp_tx_full; -- 2.17.0
[net-next 10/13] net/mlx5e: Add support for XDP_REDIRECT in device-out side
From: Tariq Toukan Add implementation for the ndo_xdp_xmit callback. Dedicate a new set of XDP-SQ instances to satisfy the XDP_REDIRECT requests. These instances are totally separated from the existing XDP-SQ objects that satisfy local XDP_TX actions. Performance tests: xdp_redirect_map from ConnectX-5 to ConnectX-5. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Packet-rate of 64B packets. Single queue: 7 Mpps. Multi queue: 55 Mpps. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 5 + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 92 --- .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 2 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 36 ++-- .../ethernet/mellanox/mlx5/core/en_stats.c| 32 ++- .../ethernet/mellanox/mlx5/core/en_stats.h| 5 + .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 3 + 7 files changed, 154 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 16e219a8f240..3f21cafe6be3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -344,6 +344,7 @@ enum { MLX5E_SQ_STATE_IPSEC, MLX5E_SQ_STATE_AM, MLX5E_SQ_STATE_TLS, + MLX5E_SQ_STATE_REDIRECT, }; struct mlx5e_sq_wqe_info { @@ -598,6 +599,9 @@ struct mlx5e_channel { __be32 mkey_be; u8 num_tc; + /* XDP_REDIRECT */ + struct mlx5e_xdpsq xdpsq; + /* data path - accessed per napi poll */ struct irq_desc *irq_desc; struct mlx5e_ch_stats *stats; @@ -621,6 +625,7 @@ struct mlx5e_channel_stats { struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; struct mlx5e_rq_stats rq; struct mlx5e_xdpsq_stats rq_xdpsq; + struct mlx5e_xdpsq_stats xdpsq; } cacheline_aligned_in_smp; enum mlx5e_traffic_types { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index eabd5537927d..bab8cd44d1c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -167,6 +167,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) struct mlx5e_xdpsq *sq; struct mlx5_cqe64 *cqe; struct mlx5e_rq *rq; + bool is_redirect; u16 sqcc; int i; @@ -179,6 +180,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) if (!cqe) return false; + is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state); rq = container_of(sq, struct mlx5e_rq, xdpsq); /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), @@ -196,17 +198,20 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); do { - struct mlx5e_xdp_info *xdpi; - u16 ci; + u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci]; last_wqe = (sqcc == wqe_counter); - - ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); - xdpi = &sq->db.xdpi[ci]; - sqcc++; - /* Recycle RX page */ - mlx5e_page_release(rq, &xdpi->di, true); + + if (is_redirect) { + xdp_return_frame(xdpi->xdpf); + dma_unmap_single(sq->pdev, xdpi->dma_addr, +xdpi->xdpf->len, DMA_TO_DEVICE); + } else { + /* Recycle RX page */ + mlx5e_page_release(rq, &xdpi->di, true); + } } while (!last_wqe); } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); @@ -223,16 +228,75 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) { - struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq); - struct mlx5e_xdp_info *xdpi; - u16 ci; + struct mlx5e_rq *rq; + bool is_redirect; + + is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state); + rq = is_redirect ? NULL : container_of(sq, struct mlx5e_rq, xdpsq); while (sq->cc != sq->pc) { - ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); - xdpi = &sq->db.xdpi[ci]; + u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); + struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci]; + sq->cc++; - mlx5e_page_release(rq, &xdpi->di, false); + if (is_redirect) { + xdp_return_fram
[net-next 12/13] net/mlx5e: TX, Move DB fields in TXQ-SQ struct
From: Tariq Toukan Pointers in DB are static, move them to read-only area so they do not share a cacheline with fields modified in datapath. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3f21cafe6be3..c41cfc2a4b70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -365,16 +365,14 @@ struct mlx5e_txqsq { struct mlx5e_cqcq; - /* write@xmit, read@completion */ - struct { - struct mlx5e_sq_dma *dma_fifo; - struct mlx5e_tx_wqe_info *wqe_info; - } db; - /* read only */ struct mlx5_wq_cyc wq; u32dma_fifo_mask; struct mlx5e_sq_stats *stats; + struct { + struct mlx5e_sq_dma *dma_fifo; + struct mlx5e_tx_wqe_info *wqe_info; + } db; void __iomem *uar_map; struct netdev_queue *txq; u32sqn; -- 2.17.0
[net-next 04/13] net/mlx5e: Restrict the combination of large MTU and XDP
From: Tariq Toukan Add checks in control path upon an MTU change or an XDP program set, to prevent reaching cases where large MTU and XDP are set simultaneously. This is to make sure we allow XDP only with the linear RX memory scheme, i.e. a received packet is not scattered to different pages. Change mlx5e_rx_get_linear_frag_sz() accordingly, so that we make sure the XDP configuration can really be set, instead of assuming that it is. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 39 +++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index dc9aa070eafa..7840f6f44379 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -72,6 +72,8 @@ struct page_pool; #define MLX5_RX_HEADROOM NET_SKB_PAD #define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#define MLX5E_XDP_MAX_MTU ((int)(PAGE_SIZE - \ +MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM))) #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a1a54ea3aeb4..56ae6e428fdf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -96,14 +96,19 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params) { - if (!params->xdp_prog) { - u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); - u16 rq_headroom = MLX5_RX_HEADROOM + NET_IP_ALIGN; + u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + u16 linear_rq_headroom = params->xdp_prog ? + XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM; + u32 frag_sz; - return MLX5_SKB_FRAG_SZ(rq_headroom + hw_mtu); - } + linear_rq_headroom += NET_IP_ALIGN; - return PAGE_SIZE; + frag_sz = MLX5_SKB_FRAG_SZ(linear_rq_headroom + hw_mtu); + + if (params->xdp_prog && frag_sz < PAGE_SIZE) + frag_sz = PAGE_SIZE; + + return frag_sz; } static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params) @@ -3707,6 +3712,14 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, new_channels.params = *params; new_channels.params.sw_mtu = new_mtu; + if (params->xdp_prog && + !mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { + netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n", + new_mtu, MLX5E_XDP_MAX_MTU); + err = -EINVAL; + goto out; + } + if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params); u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params); @@ -4094,9 +4107,10 @@ static void mlx5e_tx_timeout(struct net_device *dev) queue_work(priv->wq, &priv->tx_timeout_work); } -static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) +static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) { struct net_device *netdev = priv->netdev; + struct mlx5e_channels new_channels = {}; if (priv->channels.params.lro_en) { netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); @@ -4108,6 +4122,15 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) return -EINVAL; } + new_channels.params = priv->channels.params; + new_channels.params.xdp_prog = prog; + + if (!mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { + netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n", + new_channels.params.sw_mtu, MLX5E_XDP_MAX_MTU); + return -EINVAL; + } + return 0; } @@ -4122,7 +4145,7 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) mutex_lock(&priv->state_lock); if (prog) { - err = mlx5e_xdp_allowed(priv); + err = mlx5e_xdp_allowed(priv, prog); if (err) goto unlock; } -- 2.17.0
[net-next 13/13] net/mlx5e: TX, Use function to access sq_dma object in fifo
From: Tariq Toukan Use designated function mlx5e_dma_get() to get the mlx5e_sq_dma object to be pushed into fifo. Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tx.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 9106ea45e3cb..ae73ea992845 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -66,22 +66,21 @@ static inline void mlx5e_tx_dma_unmap(struct device *pdev, } } +static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) +{ + return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; +} + static inline void mlx5e_dma_push(struct mlx5e_txqsq *sq, dma_addr_t addr, u32 size, enum mlx5e_dma_map_type map_type) { - u32 i = sq->dma_fifo_pc & sq->dma_fifo_mask; + struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, sq->dma_fifo_pc++); - sq->db.dma_fifo[i].addr = addr; - sq->db.dma_fifo[i].size = size; - sq->db.dma_fifo[i].type = map_type; - sq->dma_fifo_pc++; -} - -static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) -{ - return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; + dma->addr = addr; + dma->size = size; + dma->type = map_type; } static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) -- 2.17.0
[net-next 11/13] net/mlx5e: RX, Prefetch the xdp_frame data area
From: Tariq Toukan A loaded XDP program might write to the xdp_frame data area, prefetchw() it to avoid a potential cache miss. Performance tests: ConnectX-5, XDP_TX packet rate, single ring. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Before: 13,172,976 pps After: 13,456,248 pps 2% gain. Fixes: 22f453988194 ("net/mlx5e: Support XDP over Striding RQ") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e33ca03b2100..15d8ae28c040 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1099,6 +1099,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset, frag_size, DMA_FROM_DEVICE); + prefetchw(va); /* xdp_frame data area */ prefetch(data); rcu_read_lock(); -- 2.17.0
[net-next 01/13] net/mlx5e: Replace call to MPWQE free with dealloc in interface down flow
From: Tariq Toukan No need to expose the MPWQE free function to control path. The dealloc function already exposed, use it. Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b237ccdf56..dc9aa070eafa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -892,7 +892,6 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq); bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq); void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix); void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix); -void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi); struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, u16 cqe_bcnt, u32 head_offset, u32 page_idx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dccde18f6170..c214b23e14ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -877,7 +877,7 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq) /* UMR WQE (if in progress) is always at wq->head */ if (rq->mpwqe.umr_in_progress) - mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]); + rq->dealloc_wqe(rq, wq->head); while (!mlx5_wq_ll_is_empty(wq)) { struct mlx5e_rx_wqe_ll *wqe; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1d5295ee863c..e6b3d178c45f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -396,7 +396,7 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev, } } -void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) +static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) { const bool no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); -- 2.17.0
[net-next 08/13] net/mlx5e: Refactor XDP counters
From: Tariq Toukan Separate the XDP counters into two sets: (1) One set reside in the RQ stats, and they monitor XDP stats in the RQ side. (2) Another set is per XDP-SQ, and they monitor XDP stats that are related to XDP transmit flow. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 12 ++--- .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../ethernet/mellanox/mlx5/core/en_stats.c| 47 +-- .../ethernet/mellanox/mlx5/core/en_stats.h| 16 +-- 5 files changed, 52 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 118d66207079..d9e24fbe6a28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -424,6 +424,7 @@ struct mlx5e_xdpsq { /* read only */ struct mlx5_wq_cyc wq; + struct mlx5e_xdpsq_stats *stats; void __iomem *uar_map; u32sqn; struct device *pdev; @@ -619,6 +620,7 @@ struct mlx5e_channel_stats { struct mlx5e_ch_stats ch; struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; struct mlx5e_rq_stats rq; + struct mlx5e_xdpsq_stats rq_xdpsq; } cacheline_aligned_in_smp; enum mlx5e_traffic_types { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 53d011eb71ab..26e24823504b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -106,8 +106,6 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); - struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq); - struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg = wqe->data; @@ -116,12 +114,12 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) dma_addr_t dma_addr = xdpi->dma_addr; unsigned int dma_len = xdpf->len; - struct mlx5e_rq_stats *stats = rq->stats; + struct mlx5e_xdpsq_stats *stats = sq->stats; prefetchw(wqe); if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || sq->hw_mtu < dma_len)) { - stats->xdp_drop++; + stats->err++; return false; } @@ -131,7 +129,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) mlx5e_xmit_xdp_doorbell(sq); sq->db.doorbell = false; } - stats->xdp_tx_full++; + stats->full++; return false; } @@ -160,7 +158,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) sq->db.doorbell = true; - stats->xdp_tx++; + stats->xmit++; return true; } @@ -212,7 +210,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) } while (!last_wqe); } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); - rq->stats->xdp_tx_cqe += i; + sq->stats->cqes += i; mlx5_cqwq_update_db_record(&cq->wq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7ed71db9b32f..5a6c4403a3e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1001,6 +1001,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu= MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->stats = &c->priv->channel_stats[c->ix].rq_xdpsq; param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index b88db9d923ad..85b1827b76c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -60,9 +60,10 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, - { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, - { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe)
[net-next 07/13] net/mlx5e: Make XDP xmit functions more generic
From: Tariq Toukan Convert the XDP xmit functions to use the generic xdp_frame API in XDP_TX flow. Same functions will be used later in this series to transmit the XDP redirect-out packets as well. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 20 -- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 70 +++ .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 3 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 10 +-- 4 files changed, 61 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2f1058da0907..118d66207079 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -395,6 +395,17 @@ struct mlx5e_txqsq { } recover; } cacheline_aligned_in_smp; +struct mlx5e_dma_info { + struct page *page; + dma_addr_t addr; +}; + +struct mlx5e_xdp_info { + struct xdp_frame *xdpf; + dma_addr_tdma_addr; + struct mlx5e_dma_info di; +}; + struct mlx5e_xdpsq { /* data path */ @@ -406,7 +417,7 @@ struct mlx5e_xdpsq { /* write@xmit, read@completion */ struct { - struct mlx5e_dma_info *di; + struct mlx5e_xdp_info *xdpi; bool doorbell; bool redirect_flush; } db; @@ -419,6 +430,7 @@ struct mlx5e_xdpsq { __be32 mkey_be; u8 min_inline_mode; unsigned long state; + unsigned int hw_mtu; /* control path */ struct mlx5_wq_ctrlwq_ctrl; @@ -455,11 +467,6 @@ mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n) return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc); } -struct mlx5e_dma_info { - struct page *page; - dma_addr_t addr; -}; - struct mlx5e_wqe_frag_info { struct mlx5e_dma_info *di; u32 offset; @@ -562,7 +569,6 @@ struct mlx5e_rq { /* XDP */ struct bpf_prog *xdp_prog; - unsigned int hw_mtu; struct mlx5e_xdpsq xdpsq; DECLARE_BITMAP(flags, 8); struct page_pool *page_pool; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 34accf3f4cee..53d011eb71ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -33,6 +33,23 @@ #include #include "en/xdp.h" +static inline bool +mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_dma_info *di, + struct xdp_buff *xdp) +{ + struct mlx5e_xdp_info xdpi; + + xdpi.xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpi.xdpf)) + return false; + xdpi.dma_addr = di->addr + (xdpi.xdpf->data - (void *)xdpi.xdpf); + dma_sync_single_for_device(sq->pdev, xdpi.dma_addr, + xdpi.xdpf->len, PCI_DMA_TODEVICE); + xdpi.di = *di; + + return mlx5e_xmit_xdp_frame(sq, &xdpi); +} + /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, void *va, u16 *rx_headroom, u32 *len) @@ -58,22 +75,24 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, *len = xdp.data_end - xdp.data; return false; case XDP_TX: - if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) - trace_xdp_exception(rq->netdev, prog, act); + if (unlikely(!mlx5e_xmit_xdp_buff(&rq->xdpsq, di, &xdp))) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ return true; case XDP_REDIRECT: /* When XDP enabled then page-refcnt==1 here */ err = xdp_do_redirect(rq->netdev, &xdp, prog); - if (!err) { - __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); - rq->xdpsq.db.redirect_flush = true; - mlx5e_page_dma_unmap(rq, di); - } + if (unlikely(err)) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); + rq->xdpsq.db.redirect_flush = true; + mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: +xdp_abort: trace_xdp_exception(rq->netdev, prog, act); case XDP_DROP: rq->stats->xdp_drop++; @@ -81,27 +100,27 @@ bool mlx5e_xd
[net-next 09/13] net/mlx5e: Re-order fields of struct mlx5e_xdpsq
From: Tariq Toukan In the downstream patch that adds support to XDP_REDIRECT-out, the XDP xmit frame function doesn't share the same run context as the NAPI that polls the XDP-SQ completion queue. Hence, need to re-order the XDP-SQ fields to avoid cacheline false-sharing. Take redirect_flush and doorbell out of DB, into separated cachelines. Add a cacheline breaker within the stats struct. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 18 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 8 .../net/ethernet/mellanox/mlx5/core/en_rx.c| 8 .../net/ethernet/mellanox/mlx5/core/en_stats.h | 3 ++- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d9e24fbe6a28..16e219a8f240 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -409,22 +409,22 @@ struct mlx5e_xdp_info { struct mlx5e_xdpsq { /* data path */ - /* dirtied @rx completion */ + /* dirtied @completion */ u16cc; - u16pc; + bool redirect_flush; - struct mlx5e_cqcq; + /* dirtied @xmit */ + u16pc cacheline_aligned_in_smp; + bool doorbell; - /* write@xmit, read@completion */ - struct { - struct mlx5e_xdp_info *xdpi; - bool doorbell; - bool redirect_flush; - } db; + struct mlx5e_cqcq; /* read only */ struct mlx5_wq_cyc wq; struct mlx5e_xdpsq_stats *stats; + struct { + struct mlx5e_xdp_info *xdpi; + } db; void __iomem *uar_map; u32sqn; struct device *pdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 26e24823504b..eabd5537927d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -85,7 +85,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, if (unlikely(err)) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); - rq->xdpsq.db.redirect_flush = true; + rq->xdpsq.redirect_flush = true; mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; @@ -124,10 +124,10 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) } if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) { - if (sq->db.doorbell) { + if (sq->doorbell) { /* SQ is full, ring doorbell */ mlx5e_xmit_xdp_doorbell(sq); - sq->db.doorbell = false; + sq->doorbell = false; } stats->full++; return false; @@ -156,7 +156,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) sq->db.xdpi[pi] = *xdpi; sq->pc++; - sq->db.doorbell = true; + sq->doorbell = true; stats->xmit++; return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 70b984c4e8a4..e33ca03b2100 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1201,14 +1201,14 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) rq->handle_rx_cqe(rq, cqe); } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); - if (xdpsq->db.doorbell) { + if (xdpsq->doorbell) { mlx5e_xmit_xdp_doorbell(xdpsq); - xdpsq->db.doorbell = false; + xdpsq->doorbell = false; } - if (xdpsq->db.redirect_flush) { + if (xdpsq->redirect_flush) { xdp_do_flush_map(); - xdpsq->db.redirect_flush = false; + xdpsq->redirect_flush = false; } mlx5_cqwq_update_db_record(&cq->wq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 95e1f32c67d9..7aa8ff389cdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -230,7 +230,8 @@ struct mlx5e_xdpsq_stats { u64 xmit; u64 full; u64 err; - u64 cqes; + /* dirtied @completion */ + u64 cqes cacheline_aligned_in
[net-next 05/13] net/mlx5e: Move XDP related code into new XDP files
From: Tariq Toukan Take XDP code out of the general EN header and RX file into new XDP files. Currently, XDP-SQ resides only within an RQ and used from a single flow (XDP_TX) triggered upon RX completions. In a downstream patch, additional type of XDP-SQ instances will be presented and used for the XDP_REDIRECT flow, totally unrelated to the RX context. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 225 ++ .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 62 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 208 +--- .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 1 + 7 files changed, 293 insertions(+), 215 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index fa7fcca5dc78..ae2bdcb1647c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -14,7 +14,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ fpga/ipsec.o fpga/tls.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ - en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \ + en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \ en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7840f6f44379..2f1058da0907 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -72,8 +72,6 @@ struct page_pool; #define MLX5_RX_HEADROOM NET_SKB_PAD #define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -#define MLX5E_XDP_MAX_MTU ((int)(PAGE_SIZE - \ -MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM))) #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ @@ -149,10 +147,6 @@ struct page_pool; (DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_ICOSQ_MAX_WQEBBS MLX5E_UMR_WQEBBS -#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) -#define MLX5E_XDP_TX_DS_COUNT \ - ((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */) - #define MLX5E_NUM_MAIN_GROUPS 9 #define MLX5E_MSG_LEVELNETIF_MSG_LINK @@ -878,14 +872,13 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event); int mlx5e_napi_poll(struct napi_struct *napi, int budget); bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); -bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); -void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev); bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info); void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, bool recycle); void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c new file mode 100644 index ..649675c1af61 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + *copyright notice, this list of conditions and the following + *disclaimer. + * + * - Redistributions in binary form must reproduce the above + *copyright notice, this list of conditions and the following + *disclaimer in the documentation and/or other ma
[net-next 02/13] net/mlx5e: Do not recycle RX pages in interface down flow
From: Tariq Toukan Keep all page-pool recycle calls within NAPI context. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 37 ++- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e6b3d178c45f..97db5eeca9f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -277,10 +277,11 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq, } static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq, -struct mlx5e_wqe_frag_info *frag) +struct mlx5e_wqe_frag_info *frag, +bool recycle) { if (frag->last_in_page) - mlx5e_page_release(rq, frag->di, true); + mlx5e_page_release(rq, frag->di, recycle); } static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) @@ -308,25 +309,26 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, free_frags: while (--i >= 0) - mlx5e_put_rx_frag(rq, --frag); + mlx5e_put_rx_frag(rq, --frag, true); return err; } static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq, -struct mlx5e_wqe_frag_info *wi) +struct mlx5e_wqe_frag_info *wi, +bool recycle) { int i; for (i = 0; i < rq->wqe.info.num_frags; i++, wi++) - mlx5e_put_rx_frag(rq, wi); + mlx5e_put_rx_frag(rq, wi, recycle); } void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix); - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, false); } static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk) @@ -396,7 +398,8 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev, } } -static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) +static void +mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle) { const bool no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); @@ -405,7 +408,7 @@ static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) - mlx5e_page_release(rq, &dma_info[i], true); + mlx5e_page_release(rq, &dma_info[i], recycle); } static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq) @@ -505,8 +508,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; - - mlx5e_free_rx_mpwqe(rq, wi); + /* Don't recycle, this function is called on rq/netdev close */ + mlx5e_free_rx_mpwqe(rq, wi, false); } bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq) @@ -1113,7 +1116,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); wq_cyc_pop: mlx5_wq_cyc_pop(wq); } @@ -1155,7 +1158,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); wq_cyc_pop: mlx5_wq_cyc_pop(wq); } @@ -1292,7 +1295,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) wq = &rq->mpwqe.wq; wqe = mlx5_wq_ll_get_wqe(wq, wqe_id); - mlx5e_free_rx_mpwqe(rq, wi); + mlx5e_free_rx_mpwqe(rq, wi, true); mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); } @@ -1521,7 +1524,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); wq_free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); mlx5_wq_cyc_pop(wq); } @@ -1544,19 +1547,19 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt); if (unlikely(!skb)) { /* a DROP, save the page-reuse checks */ - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); goto wq_cyc_pop; } skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb, &cqe_bcnt); if (unlikely(!skb)) { - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true);
[pull request][net-next 00/13] Mellanox, mlx5e updates 2018-07-26 (XDP redirect)
Hi Dave, This series from Tariq adds the support for device-out XDP redirect. For more information please see tag log below. Please pull and let me know if there's any problem. Thanks, Saeed. --- The following changes since commit 6a8fab17940d4934293d4145abce00e178393bec: Merge branch '10GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (2018-07-26 14:14:01 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-updates-2018-07-26 for you to fetch changes up to 8ee48233566624826d185bf63735cc01d7113fce: net/mlx5e: TX, Use function to access sq_dma object in fifo (2018-07-26 15:23:59 -0700) mlx5e-updates-2018-07-26 (XDP redirect) This series from Tariq adds the support for device-out XDP redirect. Start with a simple RX and XDP cleanups: - Replace call to MPWQE free with dealloc in interface down flow - Do not recycle RX pages in interface down flow - Gather all XDP pre-requisite checks in a single function - Restrict the combination of large MTU and XDP Since now XDP logic is going to be called from TX side as well, generic XDP TX logic is not RX only anymore, for that Tariq creates a new xdp.c file and moves XDP related code into it, and generalizes the code to support XDP TX for XDP redirect, such as the xdp tx sq structures and xdp counters. XDP redirect support: Add implementation for the ndo_xdp_xmit callback. Dedicate a new set of XDP-SQ instances to satisfy the XDP_REDIRECT requests. These instances are totally separated from the existing XDP-SQ objects that satisfy local XDP_TX actions. Performance tests: xdp_redirect_map from ConnectX-5 to ConnectX-5. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Packet-rate of 64B packets. Single queue: 7 Mpps. Multi queue: 55 Mpps. -Saeed. Tariq Toukan (13): net/mlx5e: Replace call to MPWQE free with dealloc in interface down flow net/mlx5e: Do not recycle RX pages in interface down flow net/mlx5e: Gather all XDP pre-requisite checks in a single function net/mlx5e: Restrict the combination of large MTU and XDP net/mlx5e: Move XDP related code into new XDP files net/mlx5e: Add counter for XDP redirect in RX net/mlx5e: Make XDP xmit functions more generic net/mlx5e: Refactor XDP counters net/mlx5e: Re-order fields of struct mlx5e_xdpsq net/mlx5e: Add support for XDP_REDIRECT in device-out side net/mlx5e: RX, Prefetch the xdp_frame data area net/mlx5e: TX, Move DB fields in TXQ-SQ struct net/mlx5e: TX, Use function to access sq_dma object in fifo drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 61 +++-- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 302 + drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 63 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 116 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c| 254 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 80 +- drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 24 +- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c| 19 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 + 10 files changed, 612 insertions(+), 313 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
RE: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
> From: Jakub Kicinski [mailto:jakub.kicin...@netronome.com] > Sent: Thursday, July 26, 2018 2:47 PM > On Thu, 26 Jul 2018 14:21:08 -0700, Jakub Kicinski wrote: > > On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > > > From: Tony Nguyen > > > > > > XDP does not support jumbo frames or LRO. These checks are being > > > made outside the driver when an XDP program is loaded, however, > > > there is nothing preventing these from changing after an XDP program is > loaded. > > > Add the checks so that while an XDP program is loaded, do not allow > > > MTU to be changed or LRO to be enabled. > > > > > > Signed-off-by: Tony Nguyen > > > Tested-by: Andrew Bowers > > > Signed-off-by: Jeff Kirsher > > > --- > > > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > > > 1 file changed, 10 insertions(+) > > > > > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > index 5a6600f7b382..c42256e91997 100644 > > > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device > > > *netdev, int new_mtu) { > > > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > > > > > + if (adapter->xdp_prog) { > > > + e_warn(probe, "MTU cannot be changed while XDP program is > loaded\n"); > > > + return -EPERM; > > > > EPERM looks wrong, EINVAL is common. Also most drivers will just > > check the bounds like you do in ixgbe_xdp_setup(), allowing the change > > if new MTU still fits constraints. > > > > FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED > flag > > changes also covered while xdp is enabled? Quick grep doesn't reveal > > them being checked against xdp_prog other than in ixgbe_xdp_setup(). > > Ah, I didn't make the review in time :) Could you follow up? Yes, I'll make a follow-on patch to address your comments.
Re: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
On Thu, 26 Jul 2018 14:21:08 -0700, Jakub Kicinski wrote: > On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > > From: Tony Nguyen > > > > XDP does not support jumbo frames or LRO. These checks are being made > > outside the driver when an XDP program is loaded, however, there is > > nothing preventing these from changing after an XDP program is loaded. > > Add the checks so that while an XDP program is loaded, do not allow MTU > > to be changed or LRO to be enabled. > > > > Signed-off-by: Tony Nguyen > > Tested-by: Andrew Bowers > > Signed-off-by: Jeff Kirsher > > --- > > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > > 1 file changed, 10 insertions(+) > > > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > index 5a6600f7b382..c42256e91997 100644 > > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device > > *netdev, int new_mtu) > > { > > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > > > + if (adapter->xdp_prog) { > > + e_warn(probe, "MTU cannot be changed while XDP program is > > loaded\n"); > > + return -EPERM; > > EPERM looks wrong, EINVAL is common. Also most drivers will just check > the bounds like you do in ixgbe_xdp_setup(), allowing the change if new > MTU still fits constraints. > > FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED flag > changes also covered while xdp is enabled? Quick grep doesn't reveal > them being checked against xdp_prog other than in ixgbe_xdp_setup(). Ah, I didn't make the review in time :) Could you follow up?
[RFC bpf-next 0/6] net: xsk: minor improvements around queue handling
Hi! This set tries to make the core take care of error checking for the drivers. In particular making sure that the AF_XDP UMEM is not installed on queues which don't exist (or are disabled) and that changing queue (AKA ethtool channel) count cannot disable queues with active AF_XDF zero-copy sockets. I'm sending as an RFC because I'm not entirely sure what the desired behaviour is here. Is it Okay to install AF_XDP on queues which don't exist? I presume not? Are the AF_XDP queue_ids referring to TX queues as well as RX queues in case of the driver? I presume not? Should we try to prevent disabling queues which have non zero-copy sockets installed as well? :S Anyway, if any of those patches seem useful and reasonable, please let me know I will repost as non-RFC. Jakub Kicinski (6): net: update real_num_rx_queues even when !CONFIG_SYSFS xsk: refactor xdp_umem_assign_dev() xsk: don't allow umem replace at stack level xsk: don't allow installing UMEM beyond the number of queues ethtool: rename local variable max -> curr ethtool: don't allow disabling queues with umem installed include/linux/netdevice.h | 16 +++-- net/core/ethtool.c| 19 ++ net/xdp/xdp_umem.c| 73 --- 3 files changed, 71 insertions(+), 37 deletions(-) -- 2.17.1
[RFC bpf-next 2/6] xsk: refactor xdp_umem_assign_dev()
Return early and only take the ref on dev once there is no possibility of failing. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/xdp/xdp_umem.c | 49 -- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f47abb46c587..c199d66b5f3f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -56,41 +56,34 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_copy) return 0; - dev_hold(dev); + if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ - if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { - bpf.command = XDP_QUERY_XSK_UMEM; + bpf.command = XDP_QUERY_XSK_UMEM; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; - } + if (err) + return force_zc ? -ENOTSUPP : 0; - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = umem; - bpf.xsk.queue_id = queue_id; + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? err : 0; /* fail or fallback */ - } - - umem->dev = dev; - umem->queue_id = queue_id; - umem->zc = true; - return 0; - } + if (err) + return force_zc ? err : 0; /* fail or fallback */ - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ + dev_hold(dev); + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- 2.17.1
[RFC bpf-next 4/6] xsk: don't allow installing UMEM beyond the number of queues
Don't allow installing UMEM on queue IDs higher than real_num_rx_queues. Note that the check in xsk_bind() is advisory at most, since it's done without rtnl. Besides from driver's perspective the UMEM queue ID really only relates to RX queues. TX real_num_tx_queues queues the driver exposes are for SKBs. AF_XDP zero-copy TX will most likely use separate rings much like XDP_TX does. So the AF_XDP's concept of queue indexes only relates to RX queues at driver level. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/xdp/xdp_umem.c | 5 + 1 file changed, 5 insertions(+) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 911ca6d3cb5a..8139a2c5c5ed 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -79,6 +79,11 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); + if (queue_id >= dev->real_num_rx_queues) { + err = -EINVAL; + goto err_rtnl_unlock; + } + err = xdp_umem_query(dev, queue_id); if (err) { err = err < 0 ? -ENOTSUPP : -EBUSY; -- 2.17.1
[RFC bpf-next 6/6] ethtool: don't allow disabling queues with umem installed
We already check the RSS indirection table does not use queues which would be disabled by channel reconfiguration. Make sure user does not try to disable queues which have a UMEM and zero- -copy AF_XDP socket installed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 8 net/core/ethtool.c| 7 +++ 2 files changed, 15 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a5a34f0fb485..c0df40deec54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3566,7 +3566,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); + +#if defined(CONFIG_XDP_SOCKETS) int xdp_umem_query(struct net_device *dev, u16 queue_id); +#else +static inline int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + return 0; +} +#endif int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d5093577fe6..a8e693de5b11 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1745,6 +1745,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; + unsigned int i; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1768,6 +1769,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; + /* Disabling channels, query zero-copy AF_XDP sockets */ + for (i = channels.combined_count + channels.rx_count; +i < curr.combined_count + curr.rx_count; i++) + if (xdp_umem_query(dev, i) > 0) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } -- 2.17.1
[RFC bpf-next 5/6] ethtool: rename local variable max -> curr
ethtool_set_channels() validates the config against driver's max settings. It retrieves the current config and stores it in a variable called max. This was okay when only max settings were accessed but we will soon want to access current settings as well, so calling the entire structure max makes the code less readable. While at it drop unnecessary parenthesis. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/core/ethtool.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c9993c6c2fd4..4d5093577fe6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1743,7 +1743,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) @@ -1752,13 +1752,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; - dev->ethtool_ops->get_channels(dev, &max); + dev->ethtool_ops->get_channels(dev, &curr); /* ensure new counts are within the maximums */ - if ((channels.rx_count > max.max_rx) || - (channels.tx_count > max.max_tx) || - (channels.combined_count > max.max_combined) || - (channels.other_count > max.max_other)) + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow -- 2.17.1
[RFC bpf-next 1/6] net: update real_num_rx_queues even when !CONFIG_SYSFS
We used to depend on real_num_rx_queues as a upper bound for sanity checks. For AF_XDP it's useful if drivers can trust the stack never to try to install UMEM for queues which are not configured. Update dev->real_num_rx_queues even without sysfs compiled, otherwise it would always stay equal dev->num_rx_queues. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1295c7a452e..6717dc7e8fbf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3433,6 +3433,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); static inline int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { + dev->real_num_rx_queues = rxqs; return 0; } #endif -- 2.17.1
[RFC bpf-next 3/6] xsk: don't allow umem replace at stack level
Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 7 --- net/xdp/xdp_umem.c| 37 - 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6717dc7e8fbf..a5a34f0fb485 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,10 +872,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query*/ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3566,6 +3566,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c199d66b5f3f..911ca6d3cb5a 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "xdp_umem.h" #include "xsk_queue.h" @@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) } } +int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + struct netdev_bpf bpf; + + ASSERT_RTNL(); + + memset(&bpf, 0, sizeof(bpf)); + bpf.command = XDP_QUERY_XSK_UMEM; + bpf.xsk.queue_id = queue_id; + + if (!dev->netdev_ops->ndo_bpf) + return 0; + return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) - return force_zc ? -ENOTSUPP : 0; + err = xdp_umem_query(dev, queue_id); + if (err) { + err = err < 0 ? -ENOTSUPP : -EBUSY; + goto err_rtnl_unlock; + } bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = umem; bpf.xsk.queue_id = queue_id; - rtnl_lock(); err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - if (err) - return force_zc ? err : 0; /* fail or fallback */ + goto err_rtnl_unlock; + rtnl_unlock(); dev_hold(dev); umem->dev = dev; umem->queue_id = queue_id; umem->zc = true; return 0; + +err_rtnl_unlock: + rtnl_unlock(); + return force_zc ? err : 0; /* fail or fallback */ } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- 2.17.1
[PATCH bpf-next 2/4] tools: libbpf: add bpf_object__find_program_by_title()
Allow users to find programs by section names. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- tools/lib/bpf/libbpf.c | 12 tools/lib/bpf/libbpf.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index afa9860db755..857d3d16968e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -873,6 +873,18 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) return NULL; } +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title) +{ + struct bpf_program *pos; + + bpf_object__for_each_program(pos, obj) { + if (pos->section_name && !strcmp(pos->section_name, title)) + return pos; + } + return NULL; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1f8fc2060460..a295fe2f822b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -86,6 +86,9 @@ const char *bpf_object__name(struct bpf_object *obj); unsigned int bpf_object__kversion(struct bpf_object *obj); int bpf_object__btf_fd(const struct bpf_object *obj); +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); + struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp)\ for ((pos) = bpf_object__next(NULL),\ -- 2.17.1
[PATCH bpf-next 1/4] tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd()
bpf_map__fd() handles NULL map gracefully and returns -EINVAL. bpf_program__fd() and bpf_program__nth_fd() crash in this case. Make the behaviour more consistent by validating prog pointer as well. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 955f8eafbf41..afa9860db755 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1991,6 +1991,9 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n) { int fd; + if (!prog) + return -EINVAL; + if (n >= prog->instances.nr || n < 0) { pr_warning("Can't get the %dth fd from program %s: only %d instances\n", n, prog->section_name, prog->instances.nr); -- 2.17.1
[PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
Hi! This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead of bpf_load.o. First two patches are minor improvements to libbpf to make the conversion (and use of libbpf in general) nicer. Jakub Kicinski (4): tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() tools: libbpf: add bpf_object__find_program_by_title() samples: bpf: convert xdp_fwd_user.c to libbpf samples: bpf: convert xdpsock_user.c to libbpf samples/bpf/Makefile | 4 ++-- samples/bpf/xdp_fwd_user.c | 34 +++--- samples/bpf/xdpsock_user.c | 38 +- tools/lib/bpf/libbpf.c | 15 +++ tools/lib/bpf/libbpf.h | 3 +++ 5 files changed, 72 insertions(+), 22 deletions(-) -- 2.17.1
[PATCH bpf-next 4/4] samples: bpf: convert xdpsock_user.c to libbpf
Convert xdpsock_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- CC: Magnus Karlsson CC: Björn Töpel samples/bpf/Makefile | 2 +- samples/bpf/xdpsock_user.c | 38 +- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 405fa6c880bb..6bfa250fc76e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -105,7 +105,7 @@ xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o -xdpsock-objs := bpf_load.o xdpsock_user.o +xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 5904b1543831..5c1fcb1a84f1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -26,7 +26,7 @@ #include #include -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "bpf_util.h" #include @@ -886,7 +886,13 @@ static void l2fwd(struct xdpsock *xsk) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + int prog_fd, qidconf_map, xsks_map; + struct bpf_object *obj; char xdp_filename[256]; + struct bpf_map *map; int i, ret, key = 0; pthread_t pt; @@ -899,24 +905,38 @@ int main(int argc, char **argv) } snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + prog_load_attr.file = xdp_filename; - if (load_bpf_file(xdp_filename)) { - fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + exit(EXIT_FAILURE); + if (prog_fd < 0) { + fprintf(stderr, "ERROR: no program found: %s\n", + strerror(prog_fd)); exit(EXIT_FAILURE); } - if (!prog_fd[0]) { - fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n", - strerror(errno)); + map = bpf_object__find_map_by_name(obj, "qidconf_map"); + qidconf_map = bpf_map__fd(map); + if (qidconf_map < 0) { + fprintf(stderr, "ERROR: no qidconf map found: %s\n", + strerror(qidconf_map)); + exit(EXIT_FAILURE); + } + + map = bpf_object__find_map_by_name(obj, "xsks_map"); + xsks_map = bpf_map__fd(map); + if (xsks_map < 0) { + fprintf(stderr, "ERROR: no xsks map found: %s\n", + strerror(xsks_map)); exit(EXIT_FAILURE); } - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { fprintf(stderr, "ERROR: link set xdp fd failed\n"); exit(EXIT_FAILURE); } - ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0); + ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); exit(EXIT_FAILURE); @@ -933,7 +953,7 @@ int main(int argc, char **argv) /* ...and insert them into the map. */ for (i = 0; i < num_socks; i++) { key = i; - ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0); + ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); exit(EXIT_FAILURE); -- 2.17.1
[PATCH bpf-next 3/4] samples: bpf: convert xdp_fwd_user.c to libbpf
Convert xdp_fwd_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- CC: dsah...@gmail.com samples/bpf/Makefile | 2 +- samples/bpf/xdp_fwd_user.c | 34 +++--- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9ea2f7b64869..405fa6c880bb 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -106,7 +106,7 @@ syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := bpf_load.o xdpsock_user.o -xdp_fwd-objs := bpf_load.o xdp_fwd_user.o +xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index a87a2048ed32..f88e1d7093d6 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -24,8 +24,7 @@ #include #include -#include "bpf_load.h" -#include "bpf_util.h" +#include "bpf/libbpf.h" #include @@ -63,9 +62,15 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + const char *prog_name = "xdp_fwd"; + struct bpf_program *prog; char filename[PATH_MAX]; + struct bpf_object *obj; int opt, i, idx, err; - int prog_id = 0; + int prog_fd, map_fd; int attach = 1; int ret = 0; @@ -75,7 +80,7 @@ int main(int argc, char **argv) attach = 0; break; case 'D': - prog_id = 1; + prog_name = "xdp_fwd_direct"; break; default: usage(basename(argv[0])); @@ -90,6 +95,7 @@ int main(int argc, char **argv) if (attach) { snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; if (access(filename, O_RDONLY) < 0) { printf("error accessing file %s: %s\n", @@ -97,19 +103,25 @@ int main(int argc, char **argv) return 1; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return 1; - } - if (!prog_fd[prog_id]) { - printf("load_bpf_file: %s\n", strerror(errno)); + prog = bpf_object__find_program_by_title(obj, prog_name); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + printf("program not found: %s\n", strerror(prog_fd)); + return 1; + } + map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, + "tx_port")); + if (map_fd < 0) { + printf("map not found: %s\n", strerror(map_fd)); return 1; } } if (attach) { for (i = 1; i < 64; ++i) - bpf_map_update_elem(map_fd[0], &i, &i, 0); + bpf_map_update_elem(map_fd, &i, &i, 0); } for (i = optind; i < argc; ++i) { @@ -126,7 +138,7 @@ int main(int argc, char **argv) if (err) ret = err; } else { - err = do_attach(idx, prog_fd[prog_id], argv[i]); + err = do_attach(idx, prog_fd, argv[i]); if (err) ret = err; } -- 2.17.1
[PATCH net-next] netdevsim: make debug dirs' dentries static
The root directories of netdevsim should only be used by the core to create per-device subdirectories, so limit their visibility to the core file. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- drivers/net/netdevsim/netdev.c| 6 +++--- drivers/net/netdevsim/netdevsim.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 2d244551298b..8d8e2b3f263e 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -41,6 +41,9 @@ struct nsim_vf_config { static u32 nsim_dev_id; +static struct dentry *nsim_ddir; +static struct dentry *nsim_sdev_ddir; + static int nsim_num_vf(struct device *dev) { struct netdevsim *ns = to_nsim(dev); @@ -566,9 +569,6 @@ static struct rtnl_link_ops nsim_link_ops __read_mostly = { .dellink= nsim_dellink, }; -struct dentry *nsim_ddir; -struct dentry *nsim_sdev_ddir; - static int __init nsim_module_init(void) { int err; diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 02be199eb005..384c254fafc5 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -103,9 +103,6 @@ struct netdevsim { struct nsim_ipsec ipsec; }; -extern struct dentry *nsim_ddir; -extern struct dentry *nsim_sdev_ddir; - #ifdef CONFIG_BPF_SYSCALL int nsim_bpf_init(struct netdevsim *ns); void nsim_bpf_uninit(struct netdevsim *ns); -- 2.17.1
Re: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > From: Tony Nguyen > > XDP does not support jumbo frames or LRO. These checks are being made > outside the driver when an XDP program is loaded, however, there is > nothing preventing these from changing after an XDP program is loaded. > Add the checks so that while an XDP program is loaded, do not allow MTU > to be changed or LRO to be enabled. > > Signed-off-by: Tony Nguyen > Tested-by: Andrew Bowers > Signed-off-by: Jeff Kirsher > --- > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > index 5a6600f7b382..c42256e91997 100644 > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device *netdev, > int new_mtu) > { > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > + if (adapter->xdp_prog) { > + e_warn(probe, "MTU cannot be changed while XDP program is > loaded\n"); > + return -EPERM; EPERM looks wrong, EINVAL is common. Also most drivers will just check the bounds like you do in ixgbe_xdp_setup(), allowing the change if new MTU still fits constraints. FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED flag changes also covered while xdp is enabled? Quick grep doesn't reveal them being checked against xdp_prog other than in ixgbe_xdp_setup(). > + } > + > /* >* For 82599EB we cannot allow legacy VFs to enable their receive >* paths when MTU greater than 1500 is configured. So display a > @@ -9407,6 +9412,11 @@ static netdev_features_t ixgbe_fix_features(struct > net_device *netdev, > if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) > features &= ~NETIF_F_LRO; > > + if (adapter->xdp_prog && (features & NETIF_F_LRO)) { > + e_dev_err("LRO is not supported with XDP\n"); > + features &= ~NETIF_F_LRO; > + } > + > return features; > } >
Re: [net-next 0/6][pull request] 10GbE Intel Wired LAN Driver Updates 2018-07-26
From: Jeff Kirsher Date: Thu, 26 Jul 2018 10:40:44 -0700 > This series contains updates to ixgbe and igb. Pulled, thanks Jeff.
Re: [patch net-next] net: sched: unmark chain as explicitly created on delete
From: Jiri Pirko Date: Thu, 26 Jul 2018 18:27:58 +0200 > From: Jiri Pirko > > Once user manually deletes the chain using "chain del", the chain cannot > be marked as explicitly created anymore. > > Signed-off-by: Jiri Pirko > Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Applied.
Re: [PATCH net-next] l2tp: remove ->recv_payload_hook
From: Guillaume Nault Date: Wed, 25 Jul 2018 14:53:33 +0200 > The tunnel reception hook is only used by l2tp_ppp for skipping PPP > framing bytes. This is a session specific operation, but once a PPP > session sets ->recv_payload_hook on its tunnel, all frames received by > the tunnel will enter pppol2tp_recv_payload_hook(), including those > targeted at Ethernet sessions (an L2TPv3 tunnel can multiplex PPP and > Ethernet sessions). > > So this mechanism is wrong, and uselessly complex. Let's just move this > functionality to the pppol2tp rx handler and drop ->recv_payload_hook. > > Signed-off-by: Guillaume Nault Applied, thanks Guillaume.
Re: [PATCH net-next] net/tls: Removed redundant checks for non-NULL
From: Vakul Garg Date: Tue, 24 Jul 2018 16:54:27 +0530 > Removed checks against non-NULL before calling kfree_skb() and > crypto_free_aead(). These functions are safe to be called with NULL > as an argument. > > Signed-off-by: Vakul Garg Applied.
Re: [PATCH net v2] net: rollback orig value on failure of dev_qdisc_change_tx_queue_len
From: Tariq Toukan Date: Tue, 24 Jul 2018 14:12:20 +0300 > Fix dev_change_tx_queue_len so it rolls back original value > upon a failure in dev_qdisc_change_tx_queue_len. > This is already done for notifirers' failures, share the code. > > In case of failure in dev_qdisc_change_tx_queue_len, some tx queues > would still be of the new length, while they should be reverted. > Currently, the revert is not done, and is marked with a TODO label > in dev_qdisc_change_tx_queue_len, and should find some nice solution > to do it. > Yet it is still better to not apply the newly requested value. > > Fixes: 48bfd55e7e41 ("net_sched: plug in qdisc ops change_tx_queue_len") > Signed-off-by: Tariq Toukan > Reviewed-by: Eran Ben Elisha > Reported-by: Ran Rozenstein Applied and queued up for -stable.
Re: [PATCH net-next] cbs: Add support for the graft function
From: Vinicius Costa Gomes Date: Mon, 23 Jul 2018 17:08:00 -0700 > This will allow to install a child qdisc under cbs. The main use case > is to install ETF (Earliest TxTime First) qdisc under cbs, so there's > another level of control for time-sensitive traffic. > > Signed-off-by: Vinicius Costa Gomes Applied, thank you.
[PATCH net] net: ena: Fix use of uninitialized DMA address bits field
UBSAN triggers the following undefined behaviour warnings: [...] [ 13.236124] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:468:22 [ 13.240043] shift exponent 64 is too large for 64-bit type 'long long unsigned int' [...] [ 13.744769] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:373:4 [ 13.748694] shift exponent 64 is too large for 64-bit type 'long long unsigned int' [...] When splitting the address to high and low, GENMASK_ULL is used to generate a bitmask with dma_addr_bits field from io_sq (in ena_com_prepare_tx and ena_com_add_single_rx_desc). The problem is that dma_addr_bits is not initialized with a proper value (besides being cleared in ena_com_create_io_queue). Assign dma_addr_bits the correct value that is stored in ena_dev when initializing the SQ. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Gal Pressman --- drivers/net/ethernet/amazon/ena/ena_com.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 1b9d3130af4d..17f12c18d225 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -333,6 +333,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + io_sq->dma_addr_bits = ena_dev->dma_addr_bits; io_sq->desc_entry_size = (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? sizeof(struct ena_eth_io_tx_desc) : -- 2.14.4
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
On Thu, 26 Jul 2018 18:31:01 +0200, Jiri Pirko wrote: > From: Jiri Pirko > > In case a chain is empty and not explicitly created by a user, > such chain should not exist. The only exception is if there is > an action "goto chain" pointing to it. In that case, don't show the > chain in the dump. Track the chain references held by actions and > use them to find out if a chain should or should not be shown > in chain dump. > > Signed-off-by: Jiri Pirko I don't have any better ideas :) One question below. > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c > index 75cce2819de9..76035cd6e3bf 100644 > --- a/net/sched/cls_api.c > +++ b/net/sched/cls_api.c > @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) > ++chain->refcnt; > } > > +static void tcf_chain_hold_by_act(struct tcf_chain *chain) > +{ > + ++chain->action_refcnt; > +} > + > +static void tcf_chain_release_by_act(struct tcf_chain *chain) > +{ > + --chain->action_refcnt; > +} > + > +static bool tcf_chain_is_zombie(struct tcf_chain *chain) > +{ > + /* In case all the references are action references, this > + * chain is a zombie and should not be listed in the chain > + * dump list. > + */ > + return chain->refcnt == chain->action_refcnt; > +} > + > static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, > u32 chain_index) > { > @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, > u32 chain_index, > } > EXPORT_SYMBOL(tcf_chain_get); > > +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 > chain_index) > +{ > + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); > + > + tcf_chain_hold_by_act(chain); > + return chain; > +} > +EXPORT_SYMBOL(tcf_chain_get_by_act); > + > static void tc_chain_tmplt_del(struct tcf_chain *chain); > > void tcf_chain_put(struct tcf_chain *chain) > @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) > } > EXPORT_SYMBOL(tcf_chain_put); > > +void tcf_chain_put_by_act(struct tcf_chain *chain) > +{ > + tcf_chain_release_by_act(chain); > + tcf_chain_put(chain); > +} > +EXPORT_SYMBOL(tcf_chain_put_by_act); > + > static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) > { > if (chain->explicitly_created) > @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct > nlmsghdr *n, > chain = tcf_chain_lookup(block, chain_index); > if (n->nlmsg_type == RTM_NEWCHAIN) { > if (chain) { > - NL_SET_ERR_MSG(extack, "Filter chain already exists"); > - return -EEXIST; > - } > - if (!(n->nlmsg_flags & NLM_F_CREATE)) { > - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and > NLM_F_CREATE to create a new chain"); > - return -ENOENT; > - } > - chain = tcf_chain_create(block, chain_index); > - if (!chain) { > - NL_SET_ERR_MSG(extack, "Failed to create filter chain"); > - return -ENOMEM; > + if (tcf_chain_is_zombie(chain)) { > + /* The chain exists only because there is > + * some action referencing it, meaning it > + * is a zombie. > + */ > + tcf_chain_hold(chain); I'm not 100% sure why this is needed? In my tree below I see: switch (n->nlmsg_type) { case RTM_NEWCHAIN: err = tc_chain_tmplt_add(chain, net, tca, extack); if (err) goto errout; /* In case the chain was successfully added, take a reference * to the chain. This ensures that an empty chain * does not disappear at the end of this function. */ tcf_chain_hold(chain); chain->explicitly_created = true; so one reference will be taken.. do we need two? > + } else { > + NL_SET_ERR_MSG(extack, "Filter chain already > exists"); > + return -EEXIST; > + } > + } else { > + if (!(n->nlmsg_flags & NLM_F_CREATE)) { > + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN > and NLM_F_CREATE to create a new chain"); > + return -ENOENT; > + } > + chain = tcf_chain_create(block, chain_index); > + if (!chain) { > + NL_SET_ERR_MSG(extack, "Failed to create filter > chain"); > + return -ENOMEM; > + } > } > } else { > if (!chain) { > @@ -1944,6 +1988,8 @@ static int tc_
Re: [PATCH net] net/mlx5e: Move mlx5e_priv_flags into en_ethtool.c
On Sun, Jul 15, 2018 at 12:06 PM, Kamal Heib wrote: > Move the definition of mlx5e_priv_flags into en_ethtool.c because it's > only used there. > > Fixes: 4e59e2888139 ("net/mlx5e: Introduce net device priv flags > infrastructure") > Signed-off-by: Kamal Heib > --- > drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 --- > drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +++ > 2 files changed, 7 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h > b/drivers/net/ethernet/mellanox/mlx5/core/en.h > index eb9eb7aa953a..84e6a5b42286 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h > @@ -208,13 +208,6 @@ struct mlx5e_umr_wqe { > > extern const char mlx5e_self_tests[][ETH_GSTRING_LEN]; > > -static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = { > - "rx_cqe_moder", > - "tx_cqe_moder", > - "rx_cqe_compress", > - "rx_striding_rq", > -}; > - Hi Kamal, on a second thought, i would like to drop this change and keep mlx5e_priv_flags close/local to the below mlx5e_priv_flag. Please let me know. > enum mlx5e_priv_flag { > MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0), > MLX5E_PFLAG_TX_CQE_BASED_MODER = (1 << 1), > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > index fffe514ba855..2a1c35d82c2e 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > @@ -33,6 +33,13 @@ > #include "en.h" > #include "en/port.h" > > +static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = { > + "rx_cqe_moder", > + "tx_cqe_moder", > + "rx_cqe_compress", > + "rx_striding_rq", > +}; > + > void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, >struct ethtool_drvinfo *drvinfo) > { > -- > 2.14.4 >
Re: [PATCH bpf] xdp: add NULL pointer check in __xdp_return()
On Thu, 26 Jul 2018 14:13:20 +0200, Björn Töpel wrote: > Den mån 23 juli 2018 kl 21:58 skrev Jakub Kicinski: > > On Mon, 23 Jul 2018 11:39:36 +0200, Björn Töpel wrote: > > > Den fre 20 juli 2018 kl 22:08 skrev Jakub Kicinski: > > > > On Fri, 20 Jul 2018 10:18:21 -0700, Martin KaFai Lau wrote: > > > > > On Sat, Jul 21, 2018 at 01:04:45AM +0900, Taehee Yoo wrote: > > > > > > rhashtable_lookup() can return NULL. so that NULL pointer > > > > > > check routine should be added. > > > > > > > > > > > > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > > > > > > Signed-off-by: Taehee Yoo > > > > > > --- > > > > > > net/core/xdp.c | 3 ++- > > > > > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > > > > > > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c > > > > > > index 9d1f220..1c12bc7 100644 > > > > > > --- a/net/core/xdp.c > > > > > > +++ b/net/core/xdp.c > > > > > > @@ -345,7 +345,8 @@ static void __xdp_return(void *data, struct > > > > > > xdp_mem_info *mem, bool napi_direct, > > > > > > rcu_read_lock(); > > > > > > /* mem->id is valid, checked in > > > > > > xdp_rxq_info_reg_mem_model() */ > > > > > > xa = rhashtable_lookup(mem_id_ht, &mem->id, > > > > > > mem_id_rht_params); > > > > > > - xa->zc_alloc->free(xa->zc_alloc, handle); > > > > > > + if (xa) > > > > > > + xa->zc_alloc->free(xa->zc_alloc, handle); > > > > > hmm...It is not clear to me the "!xa" case don't have to be handled? > > > > > > > > Actually I have a more fundamental question about this interface I've > > > > been meaning to ask. > > > > > > > > IIUC free() can happen on any CPU at any time, when whatever device, > > > > socket or CPU this got redirected to completed the TX. IOW there may > > > > be multiple producers. Drivers would need to create spin lock a'la the > > > > a9744f7ca200 ("xsk: fix potential race in SKB TX completion code") fix? > > > > > > > > > > Jakub, apologies for the slow response. I'm still in > > > "holiday/hammock&beer mode", but will be back in a week. :-P > > > > Ah, sorry to interrupt! :) > > > > Don't make it a habit! ;-P :-D > > > The idea with the xdp_return_* functions are that an xdp_buff and > > > xdp_frame can have custom allocations schemes. The difference beween > > > struct xdp_buff and struct xdp_frame is lifetime. The xdp_buff > > > lifetime is within the napi context, whereas xdp_frame can have a > > > lifetime longer/outside the napi context. E.g. for a XDP_REDIRECT > > > scenario an xdp_buff is converted to a xdp_frame. The conversion is > > > done in include/net/xdp.h:convert_to_xdp_frame. > > > > > > Currently, the zero-copy MEM_TYPE_ZERO_COPY memtype can *only* be used > > > for xdp_buff, meaning that the lifetime is constrained to a napi > > > context. Further, given an xdp_buff with memtype MEM_TYPE_ZERO_COPY, > > > doing XDP_REDIRECT to a target that is *not* an AF_XDP socket would > > > mean converting the xdp_buff to an xdp_frame. The xdp_frame can then > > > be free'd on any CPU. > > > > > > Note that the xsk_rcv* functions is always called from an napi > > > context, and therefore is using the xdp_return_buff calls. > > > > > > To answer your question -- no, this fix is *not* needed, because the > > > xdp_buff napi constrained, and the xdp_buff will only be free'd on one > > > CPU. > > > > Oh, thanks, I missed the check in convert_to_xdp_frame(), so the only > > frames which can come back via the free path are out of the error path > > in __xsk_rcv_zc()? > > > > That path looks a little surprising too, isn't the expectation that if > > xdp_do_redirect() returns an error the driver retains the ownership of > > the buffer? > > > > static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) > > { > > int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); > > > > if (err) { > > xdp_return_buff(xdp); > > xs->rx_dropped++; > > } > > > > return err; > > } > > > > This seems to call xdp_return_buff() *and* return an error. > > > > Ugh, this is indeed an error. The xdp_return buff should be removed. > Thanks for spotting this! > > So, yes, the way to get the buffer back (in ZC) to the driver is via > the error path (recycling) or via the UMEM fill queue. Okay, I'm gonna test and submit this later today for bpf tree: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72335c2e8108..4e937cd7c17d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - if (err) { - xdp_return_buff(xdp); + if (err) xs->rx_dropped++; - } return err; } Now the frame return/ZC allocator ->free path is all dead code :S
(no subject)
-- Do you need a loan at 2% interest rate for any reason ? Every interested applicant should contact us via the below contact details. E-mails: fritzmicloanf...@financier.com firtzmicloanf...@gmail.com Yours Sincerely Fritz Micheal.
[net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
From: Tony Nguyen XDP does not support jumbo frames or LRO. These checks are being made outside the driver when an XDP program is loaded, however, there is nothing preventing these from changing after an XDP program is loaded. Add the checks so that while an XDP program is loaded, do not allow MTU to be changed or LRO to be enabled. Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 5a6600f7b382..c42256e91997 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { struct ixgbe_adapter *adapter = netdev_priv(netdev); + if (adapter->xdp_prog) { + e_warn(probe, "MTU cannot be changed while XDP program is loaded\n"); + return -EPERM; + } + /* * For 82599EB we cannot allow legacy VFs to enable their receive * paths when MTU greater than 1500 is configured. So display a @@ -9407,6 +9412,11 @@ static netdev_features_t ixgbe_fix_features(struct net_device *netdev, if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) features &= ~NETIF_F_LRO; + if (adapter->xdp_prog && (features & NETIF_F_LRO)) { + e_dev_err("LRO is not supported with XDP\n"); + features &= ~NETIF_F_LRO; + } + return features; } -- 2.17.1
[net-next 0/6][pull request] 10GbE Intel Wired LAN Driver Updates 2018-07-26
This series contains updates to ixgbe and igb. Tony fixes ixgbe to add checks to ensure jumbo frames or LRO get enabled after an XDP program is loaded. Shannon Nelson adds the missing security configuration registers to the ixgbe register dump, which will help in debugging. Christian Grönke fixes an issue in igb that occurs on SGMII based SPF mdoules, by reverting changes from 2 previous patches. The issue was that initialization would fail on the fore mentioned modules because the driver would try to reset the PHY before obtaining the PHY address of the SGMII attached PHY. Venkatesh Srinivas replaces wmb() with dma_wmb() for doorbell writes, which avoids SFENCEs before the doorbell writes. Alex cleans up and refactors ixgbe Tx/Rx shutdown to reduce time needed to stop the device. The code refactor allows us to take the completion time into account when disabling queues, so that on some platforms with higher completion times, would not result in receive queues disabled messages. The following are changes since commit dc66fe43b7ebdb53628dcbc1f8f15de3e000aacf: rds: send: Fix dead code in rds_sendmsg and are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE Alexander Duyck (2): ixgbe: Reorder Tx/Rx shutdown to reduce time needed to stop device ixgbe: Refactor queue disable logic to take completion time into account Christian Grönke (1): igb: Remove superfluous reset to PHY and page 0 selection Shannon Nelson (1): ixgbe: add ipsec security registers into ethtool register dump Tony Nguyen (1): ixgbe: Do not allow LRO or MTU change with XDP Venkatesh Srinivas (1): igb: Use dma_wmb() instead of wmb() before doorbell writes drivers/net/ethernet/intel/igb/e1000_82575.c | 12 - drivers/net/ethernet/intel/igb/igb_main.c | 4 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 3 +- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 42 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 300 ++ 5 files changed, 250 insertions(+), 111 deletions(-) -- 2.17.1
[net-next 5/6] ixgbe: Reorder Tx/Rx shutdown to reduce time needed to stop device
From: Alexander Duyck This change is meant to help reduce the time needed to shutdown the transmit and receive paths for the device. Specifically what we now do after this patch is disable the transmit path first at the netdev level, and then work on disabling the Rx. This way while we are waiting on the Rx queues to be disabled the Tx queues have an opportunity to drain out. In addition I have dropped the 10ms timeout that was left in the ixgbe_down function that seems to have been carried through from back in e1000 as far as I can tell. We shouldn't need it since we don't actually disable the Tx until much later and we have additional logic in place for verifying the Tx queues have been disabled. Signed-off-by: Alexander Duyck Tested-by: Don Buchholz Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c42256e91997..aa4f05c36260 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5814,6 +5814,13 @@ void ixgbe_down(struct ixgbe_adapter *adapter) if (test_and_set_bit(__IXGBE_DOWN, &adapter->state)) return; /* do nothing if already down */ + /* Shut off incoming Tx traffic */ + netif_tx_stop_all_queues(netdev); + + /* call carrier off first to avoid false dev_watchdog timeouts */ + netif_carrier_off(netdev); + netif_tx_disable(netdev); + /* disable receives */ hw->mac.ops.disable_rx(hw); @@ -5822,16 +5829,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter) /* this call also flushes the previous write */ ixgbe_disable_rx_queue(adapter, adapter->rx_ring[i]); - usleep_range(1, 2); - /* synchronize_sched() needed for pending XDP buffers to drain */ if (adapter->xdp_ring[0]) synchronize_sched(); - netif_tx_stop_all_queues(netdev); - - /* call carrier off first to avoid false dev_watchdog timeouts */ - netif_carrier_off(netdev); - netif_tx_disable(netdev); ixgbe_irq_disable(adapter); -- 2.17.1
[net-next 3/6] igb: Remove superfluous reset to PHY and page 0 selection
From: Christian Grönke This patch reverts two previous applied patches to fix an issue that appeared when using SGMII based SFP modules. In the current state the driver will try to reset the PHY before obtaining the phy_addr of the SGMII attached PHY. That leads to an error in e1000_write_phy_reg_sgmii_82575. Causing the initialization to fail: igb: Intel(R) Gigabit Ethernet Network Driver - version 5.4.0-k igb: Copyright (c) 2007-2014 Intel Corporation. igb: probe of :??:??.? failed with error -3 The patches being reverted are: commit 182785335447957409282ca745aa5bc3968facee Author: Aaron Sierra Date: Tue Nov 29 10:03:56 2016 -0600 igb: reset the PHY before reading the PHY ID commit 440aeca4b9858248d8f16d724d9fa87a4f65fa33 Author: Matwey V Kornilov Date: Thu Nov 24 13:32:48 2016 +0300 igb: Explicitly select page 0 at initialization The first reverted patch directly causes the problem mentioned above. In case of SGMII the phy_addr is not known at this point and will only be obtained by 'igb_get_phy_id_82575' further down in the code. The second removed patch selects forces selection of page 0 in the PHY. Something that the reset tries to address as well. As pointed out by Alexander Duzck, the patch below fixes the same issue but in the proper location: commit 4e684f59d760a2c7c716bb60190783546e2d08a1 Author: Chris J Arges Date: Wed Nov 2 09:13:42 2016 -0500 igb: Workaround for igb i210 firmware issue Reverts: 440aeca4b9858248d8f16d724d9fa87a4f65fa33. Reverts: 182785335447957409282ca745aa5bc3968facee. Signed-off-by: Christian Grönke Reviewed-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/e1000_82575.c | 12 1 file changed, 12 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index b13b42e5a1d9..a795c07d0df7 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -225,19 +225,7 @@ static s32 igb_init_phy_params_82575(struct e1000_hw *hw) hw->bus.func = (rd32(E1000_STATUS) & E1000_STATUS_FUNC_MASK) >> E1000_STATUS_FUNC_SHIFT; - /* Make sure the PHY is in a good state. Several people have reported -* firmware leaving the PHY's page select register set to something -* other than the default of zero, which causes the PHY ID read to -* access something other than the intended register. -*/ - ret_val = hw->phy.ops.reset(hw); - if (ret_val) { - hw_dbg("Error resetting the PHY.\n"); - goto out; - } - /* Set phy->phy_addr and phy->id. */ - igb_write_phy_reg_82580(hw, I347AT4_PAGE_SELECT, 0); ret_val = igb_get_phy_id_82575(hw); if (ret_val) return ret_val; -- 2.17.1
[net-next 2/6] ixgbe: add ipsec security registers into ethtool register dump
From: Shannon Nelson Add the ixgbe's security configuration registers into the register dump. Signed-off-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index bd1ba88ec1d5..1d688840cd6c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -511,7 +511,7 @@ static void ixgbe_set_msglevel(struct net_device *netdev, u32 data) static int ixgbe_get_regs_len(struct net_device *netdev) { -#define IXGBE_REGS_LEN 1139 +#define IXGBE_REGS_LEN 1145 return IXGBE_REGS_LEN * sizeof(u32); } @@ -874,6 +874,14 @@ static void ixgbe_get_regs(struct net_device *netdev, /* X540 specific DCB registers */ regs_buff[1137] = IXGBE_READ_REG(hw, IXGBE_RTTQCNCR); regs_buff[1138] = IXGBE_READ_REG(hw, IXGBE_RTTQCNTG); + + /* Security config registers */ + regs_buff[1139] = IXGBE_READ_REG(hw, IXGBE_SECTXCTRL); + regs_buff[1140] = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT); + regs_buff[1141] = IXGBE_READ_REG(hw, IXGBE_SECTXBUFFAF); + regs_buff[1142] = IXGBE_READ_REG(hw, IXGBE_SECTXMINIFG); + regs_buff[1143] = IXGBE_READ_REG(hw, IXGBE_SECRXCTRL); + regs_buff[1144] = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT); } static int ixgbe_get_eeprom_len(struct net_device *netdev) -- 2.17.1
[net-next 4/6] igb: Use dma_wmb() instead of wmb() before doorbell writes
From: Venkatesh Srinivas igb writes to doorbells to post transmit and receive descriptors; after writing descriptors to memory but before writing to doorbells, use dma_wmb() rather than wmb(). wmb() is more heavyweight than necessary before doorbell writes. On x86, this avoids SFENCEs before doorbell writes in both the tx and rx refill paths. Signed-off-by: Venkatesh Srinivas Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e3a0c02721c9..25720d95d4ea 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6031,7 +6031,7 @@ static int igb_tx_map(struct igb_ring *tx_ring, * We also need this memory barrier to make certain all of the * status bits have been updated before next_to_watch is written. */ - wmb(); + dma_wmb(); /* set next_to_watch value indicating a packet is present */ first->next_to_watch = tx_desc; @@ -8531,7 +8531,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) * applicable for weak-ordered memory model archs, * such as IA-64). */ - wmb(); + dma_wmb(); writel(i, rx_ring->tail); } } -- 2.17.1
[PATCH v1 net-next] igb: Use an advanced ctx descriptor for launchtime
On i210, Launchtime (TxTime) requires the usage of an "Advanced Transmit Context Descriptor" for retrieving the timestamp of a packet. The igb driver correctly builds such descriptor on the segmentation flow (i.e. igb_tso()) or on the checksum one (i.e. igb_tx_csum()), but the feature is broken for AF_PACKET if the IGB_TX_FLAGS_VLAN is not set, which happens due to an early return on igb_tx_csum(). This flag is only set by the kernel when a VLAN interface is used, thus we can't just rely on it. Here we are fixing this issue by checking if launchtime is enabled for the current tx_ring before performing the early return. Signed-off-by: Jesus Sanchez-Palencia --- drivers/net/ethernet/intel/igb/igb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e3a0c02721c9..fa1089defcd5 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5816,7 +5816,8 @@ static void igb_tx_csum(struct igb_ring *tx_ring, struct igb_tx_buffer *first) if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_failed: - if (!(first->tx_flags & IGB_TX_FLAGS_VLAN)) + if (!(first->tx_flags & IGB_TX_FLAGS_VLAN) + && !tx_ring->launchtime_enable) return; goto no_csum; } -- 2.18.0
[PATCH bpf] bpf: btf: Use exact btf value_size match in map_check_btf()
The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects '> map->value_size' to ensure map_seq_show_elem() will not access things beyond an array element. Yonghong suggested that using '!=' is a more correct check. The 8 bytes round_up on value_size is stored in array->elem_size. Hence, using '!=' on map->value_size is a proper check. This patch also adds new tests to check the btf array key type and value type. Two of these new tests verify the btf's value_size (the change in this patch). It also fixes two existing tests that wrongly encoded a btf's type size (pprint_test) and the value_type_id (in one of the raw_tests[]). However, that do not affect these two BTF verification tests before or after this test changes. These two tests mainly failed at array creation time after this patch. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Suggested-by: Yonghong Song Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau --- kernel/bpf/arraymap.c | 2 +- tools/testing/selftests/bpf/test_btf.c | 86 +- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..2aa55d030c77 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 402c0f7cc418..ffdd27737c9e 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -507,7 +507,7 @@ static struct btf_raw_test raw_tests[] = { .key_size = sizeof(int), .value_size = sizeof(void *) * 4, .key_type_id = 1, - .value_type_id = 4, + .value_type_id = 5, .max_entries = 4, }, @@ -1292,6 +1292,88 @@ static struct btf_raw_test raw_tests[] = { .err_str = "type != 0", }, +{ + .descr = "arraymap invalid btf key (a bit field)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 32 bit int with 32 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 32, 32, 8), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf key (!= 32 bits)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 16 bit int with 0 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 16, 2), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too small)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size < map->value_size */ + .value_size = sizeof(__u64), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too big)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size > map->value_size */ + .value_size = sizeof(__u16), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + }; /* struct btf_raw_test raw_tests[] */ static con
Re: [PATCH] bpf: verifier: BPF_MOV don't mark dst reg if src == dst
Oops, gmail seems to have mangled everything. Will resend using git send-email. I've added the test cases for mov64, but I'm not sure of the expected mov32 behavior. Currently coerce_reg_to_size() is called after mark_reg_unknown(), which sets the bounds to 64bits. coerce_reg_to_size() resets the bounds again, as they're too "wide" to fit the new size. It sets SMIN = UMIN = 0, which seems weird. Shouldn't SMIN be 1 << (size * 8 - 1)? Same applies for SMAX. Should mov32 always mark the dst as unbounded? On Thu, Jul 26, 2018 at 1:42 AM, Daniel Borkmann wrote: > On 07/26/2018 12:08 AM, Arthur Fabre wrote: > > When check_alu_op() handles a BPF_MOV between two registers, > > it calls check_reg_arg() on the dst register, marking it as unbounded. > > If the src and dst register are the same, this marks the src as > > unbounded, which can lead to unexpected errors for further checks that > > rely on bounds info. > > > > check_alu_op() now only marks the dst register as unbounded if it > > different from the src register. > > > > Signed-off-by: Arthur Fabre > > --- > > kernel/bpf/verifier.c | 5 +++-- > > 1 file changed, 3 insertions(+), 2 deletions(-) > > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > > index 63aaac52a265..ddfe3c544a80 100644 > > --- a/kernel/bpf/verifier.c > > +++ b/kernel/bpf/verifier.c > > @@ -3238,8 +3238,9 @@ static int check_alu_op(struct bpf_verifier_env > > *env, struct bpf_insn *insn) > > } > > } > > > > - /* check dest operand */ > > - err = check_reg_arg(env, insn->dst_reg, DST_OP); > > + /* check dest operand, only mark if dest != src */ > > + err = check_reg_arg(env, insn->dst_reg, > > + insn->dst_reg == insn->src_reg ? > > DST_OP_NO_MARK : DST_OP); > > if (err) > > return err; > > > > Thanks a lot for the patch! Looks like it's corrupted wrt newline. > > Please also add test cases to tools/testing/selftests/bpf/test_verifier.c > for the cases of mov64 and mov32 where in each src==dst and src!=dst; mov32 > should mark it as unbounded but not former, so would be good to keep > tracking > that in selftests. >
Re: [patch net-next] selftests: forwarding: add tests for TC chain get and dump operations
From: Jiri Pirko Date: Thu, 26 Jul 2018 11:38:34 +0200 > From: Jiri Pirko > > Signed-off-by: Jiri Pirko Applied, thanks Jiri.
[patch net-next RFC] net: sched: don't dump chains only held by actions
From: Jiri Pirko In case a chain is empty and not explicitly created by a user, such chain should not exist. The only exception is if there is an action "goto chain" pointing to it. In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in chain dump. Signed-off-by: Jiri Pirko --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 1 + net/sched/act_api.c | 4 +-- net/sched/cls_api.c | 68 +++ 4 files changed, 63 insertions(+), 13 deletions(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 148a89ab789b..b43df1e25c6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); + a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; return 0; @@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) static void tcf_action_goto_chain_fini(struct tc_action *a) { - tcf_chain_put(a->goto_chain); + tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 75cce2819de9..76035cd6e3bf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } +static void tcf_chain_hold_by_act(struct tcf_chain *chain) +{ + ++chain->action_refcnt; +} + +static void tcf_chain_release_by_act(struct tcf_chain *chain) +{ + --chain->action_refcnt; +} + +static bool tcf_chain_is_zombie(struct tcf_chain *chain) +{ + /* In case all the references are action references, this +* chain is a zombie and should not be listed in the chain +* dump list. +*/ + return chain->refcnt == chain->action_refcnt; +} + static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); + + tcf_chain_hold_by_act(chain); + return chain; +} +EXPORT_SYMBOL(tcf_chain_get_by_act); + static void tc_chain_tmplt_del(struct tcf_chain *chain); void tcf_chain_put(struct tcf_chain *chain) @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); +void tcf_chain_put_by_act(struct tcf_chain *chain) +{ + tcf_chain_release_by_act(chain); + tcf_chain_put(chain); +} +EXPORT_SYMBOL(tcf_chain_put_by_act); + static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { if (chain->explicitly_created) @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; - } - if (!(n->nlmsg_flags & NLM_F_CREATE)) { - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); -
[patch net-next] net: sched: unmark chain as explicitly created on delete
From: Jiri Pirko Once user manually deletes the chain using "chain del", the chain cannot be marked as explicitly created anymore. Signed-off-by: Jiri Pirko Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f3d78c23338e..75cce2819de9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1844,6 +1844,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); + chain->explicitly_created = false; break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, -- 2.14.4
Re: [PATCH V3 bpf] xdp: add NULL pointer check in __xdp_return()
Den tors 26 juli 2018 kl 16:18 skrev Taehee Yoo : > > rhashtable_lookup() can return NULL. so that NULL pointer > check routine should be added. > Thanks Taehee! Acked-by: Björn Töpel > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > Acked-by: Martin KaFai Lau > Signed-off-by: Taehee Yoo > --- > V3 : reduce code line > V2 : add WARN_ON_ONCE when xa is NULL. > net/core/xdp.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/net/core/xdp.c b/net/core/xdp.c > index 9d1f220..6771f18 100644 > --- a/net/core/xdp.c > +++ b/net/core/xdp.c > @@ -345,7 +345,8 @@ static void __xdp_return(void *data, struct xdp_mem_info > *mem, bool napi_direct, > rcu_read_lock(); > /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() > */ > xa = rhashtable_lookup(mem_id_ht, &mem->id, > mem_id_rht_params); > - xa->zc_alloc->free(xa->zc_alloc, handle); > + if (!WARN_ON_ONCE(!xa)) > + xa->zc_alloc->free(xa->zc_alloc, handle); > rcu_read_unlock(); > default: > /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ > -- > 2.9.3 >
Re: [Query]: DSA Understanding
> I am bit confused on how dsa needs to be actually working, > Q's > 1] should I be running a dhcp server on eth1 (where switch is connected) > so that devices connected on lan* devices get an ip ? Nope. You need eth1 up, but otherwise you do not use it. Use the lanX interfaces like normal Linux interfaces. Run your dhclient on lanX, etc. > > 2] From the device where switch is connected if the cpu port wants to send >any data to any other user ports lan* how do i do it (just open > socket on eth1 or lan*) ? Just treat the lanX interfaces as normal Linux interfaces. Andrew
Re: [Query]: DSA Understanding
On Thu, Jul 26, 2018 at 3:08 PM, Andrew Lunn wrote: >> Yes I am using fixed phy on slave1, following is my dts: > > Posting the original DTS file is better, not the decompiled version. > My bad will take care of it next time. >> >> ethernet@48484000 { >> compatible = "ti,dra7-cpsw", "ti,cpsw"; >> ti,hwmods = "gmac"; >> clocks = <0x124 0x125>; >> clock-names = "fck", "cpts"; >> cpdma_channels = <0x8>; >> ale_entries = <0x400>; >> bd_ram_size = <0x2000>; >> mac_control = <0x20>; >> slaves = <0x2>; >> active_slave = <0x0>; >> cpts_clock_mult = <0x784cfe14>; >> cpts_clock_shift = <0x1d>; >> reg = <0x48484000 0x1000 0x48485200 0x2e00>; >> #address-cells = <0x1>; >> #size-cells = <0x1>; >> ti,no-idle; >> interrupts = <0x0 0x14e 0x4 0x0 0x14f 0x4 0x0 0x150 0x4 >> 0x0 0x151 0x4>; >> ranges; >> syscon = <0x8>; >> status = "okay"; >> pinctrl-names = "default", "sleep"; >> pinctrl-0 = <0x126 0x127>; >> pinctrl-1 = <0x128 0x129>; >> dual_emac; >> linux,phandle = <0x500>; >> phandle = <0x500>; > > So here is 0x500 > >> >> mdio@48485000 { >> compatible = "ti,cpsw-mdio", "ti,davinci_mdio"; >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> ti,hwmods = "davinci_mdio"; >> bus_freq = <0xf4240>; >> reg = <0x48485000 0x100>; >> status = "okay"; >> pinctrl-names = "default", "sleep"; >> pinctrl-0 = <0x12a>; >> pinctrl-1 = <0x12b>; >> >> ethernet-phy@1 { >> reg = <0x1>; >> linux,phandle = <0x12c>; >> phandle = <0x12c>; >> }; >> }; >> >> slave@48480200 { >> mac-address = [00 00 00 00 00 00]; >> status = "okay"; >> phy-handle = <0x12c>; >> phy-mode = "rgmii"; >> dual_emac_res_vlan = <0x1>; >> }; >> >> slave@48480300 { >> mac-address = [00 00 00 00 00 00]; >> status = "okay"; >> phy-mode = "rgmii"; >> dual_emac_res_vlan = <0x2>; >> linux,phandle = <0xf3>; >> phandle = <0xf3>; > > This is the actual interface you are using and it has a phandle of > 0xf3. > The dsa of_find_net_device_by_node() looks for ethernet device node, but the above node is for the cpsw slave. I tried adding 0xf3 and got probe defer, that is the reason I added 0x500 (mdio) reference to cpu switch and patched to pick up eth1 instead of eth0. >> >> fixed-link { >> speed = <0x3e8>; >> full-duplex; >> }; >> }; >> >> cpsw-phy-sel@4a002554 { >> compatible = "ti,dra7xx-cpsw-phy-sel"; >> reg = <0x4a002554 0x4>; >> reg-names = "gmii-sel"; >> }; >> }; >> >> spi@480ba000 { >> compatible = "ti,omap4-mcspi"; >> reg = <0x480ba000 0x200>; >> interrupts = <0x0 0x2b 0x4>; >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> ti,hwmods = "mcspi4"; >> ti,spi-num-cs = <0x1>; >> dmas = <0xb2 0x46 0xb2 0x47>; >> dma-names = "tx0", "rx0"; >> status = "okay"; >> ti,pindir-d0-out-d1-in; >> >> ksz9477@0 { >> compatible = "microchip,ksz9477"; >> reg = <0x0>; >> spi-max-frequency = <0x2dc6c00>; >> spi-cpha; >> spi-cpol; >> >> ports { >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> >> port@0 { >> reg = <0x0>; >> label = "lan0"; >> }; >> >> port@1 { >> reg = <0x1>; >> label = "lan1"; >> }; >> >> port@2 { >> reg = <0x2>; >> label = "lan2"; >> }; >> >> port@3 { >> reg = <0x3>; >> label = "lan3"; >> }; >> >> port@4 { >> reg = <0x4>; >> label = "lan4"; >> }; >> >> port@5 { >> reg = <0x5>; >> label = "cpu"; >> ethernet = <0x5
[PATCH net-next v2 1/2] tls: Remove dead code in tls_sw_sendmsg
tls_push_record either returns 0 on success or a negative value on failure. This patch removes code that would only be executed if tls_push_record were to return a positive value. Signed-off-by: Doron Roberts-Kedes --- net/tls/tls_sw.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 03f1370f5db1..0279bc4a168b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -423,12 +423,10 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (!ret) - continue; - if (ret < 0) + if (ret) goto send_end; + continue; - copied -= try_to_copy; fallback_to_reg_send: iov_iter_revert(&msg->msg_iter, ctx->sg_plaintext_size - orig_size); -- 2.17.1
[PATCH net-next v2 2/2] tls: Fix improper revert in zerocopy_from_iter
The current code is problematic because the iov_iter is reverted and never advanced in the non-error case. This patch skips the revert in the non-error case. This patch also fixes the amount by which the iov_iter is reverted. Currently, iov_iter is reverted by size, which can be greater than the amount by which the iter was actually advanced. Instead, only revert by the amount that the iter was advanced. Fixes: 4718799817c5 ("tls: Fix zerocopy_from_iter iov handling") Signed-off-by: Doron Roberts-Kedes --- net/tls/tls_sw.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0279bc4a168b..190b0aa79c85 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge, bool revert) + bool charge) { struct page *pages[MAX_SKB_FRAGS]; @@ -312,10 +312,10 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: + if (rc) + iov_iter_revert(from, size - *size_used); *size_used = size; *pages_used = num_elem; - if (revert) - iov_iter_revert(from, size); return rc; } @@ -417,7 +417,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true, false); + true); if (ret) goto fallback_to_reg_send; @@ -428,8 +428,6 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) continue; fallback_to_reg_send: - iov_iter_revert(&msg->msg_iter, - ctx->sg_plaintext_size - orig_size); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -833,7 +831,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], -MAX_SKB_FRAGS, false, true); +MAX_SKB_FRAGS, false); if (err < 0) goto fallback_to_reg_recv; -- 2.17.1
[PATCH net-next v2 0/2] tls: Fix improper revert in zerocopy_from_iter
This series fixes the improper iov_iter_revert introcded in "tls: Fix zerocopy_from_iter iov handling". Changes from v1: - call iov_iter_revert inside zerocopy_from_iter Doron Roberts-Kedes (2): tls: Remove dead code in tls_sw_sendmsg tls: Fix improper revert in zerocopy_from_iter net/tls/tls_sw.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) -- 2.17.1
Re: [PATCH rdma-next v2 0/8] Support mlx5 flow steering with RAW data
On Thu, Jul 26, 2018 at 07:35:49AM +0300, Leon Romanovsky wrote: > On Wed, Jul 25, 2018 at 08:35:17AM -0600, Jason Gunthorpe wrote: > > On Wed, Jul 25, 2018 at 08:37:03AM +0300, Leon Romanovsky wrote: > > > > > > Also, I would like to keep the specs consistently formatted according > > > > to clang-format with 'BinPackParameters: true', so I reflowed them as > > > > well. > > > > > > I'm using default VIM clang-format.py without anything in .clang-format. > > > Do you have an extra definitions there, except BinPackParameters? > > > > These days Linux includes a top level .clang-format that does a > > pretty good job. > > > > I have to manually switch BinPackParameters on when working with these > > specs to get the right indenting.. A pain, but maybe there is a better > > way someday.. > > I don't think that it is feasible to ask from people to change some > defaults only for patches that touch those specs. Any change in this > area will change formatting back. Eventually I think we might be able toadd a code comment to tell clang-format, but that would be down the road a bit.. > Jason, bottom line, I won't use BinPackParameters for my patches. Well, you can make sure the specs macro follows the required formatting code style by hand if you prefer.. But, I want to see them in this layout, so they are easier to maintain, not the haphazard layout we had before. Jason
[PATCH v5 bpf-next 9/9] veth: Support per queue XDP ring
From: Toshiaki Makita Move XDP and napi related fields in veth_priv to newly created veth_rq structure. When xdp_frames are enqueued from ndo_xdp_xmit and XDP_TX, rxq is selected by current cpu. When skbs are enqueued from the peer device, rxq is one to one mapping of its peer txq. This way we have a restriction that the number of rxqs must not less than the number of peer txqs, but leave the possibility to achieve bulk skb xmit in the future because txq lock would make it possible to remove rxq ptr_ring lock. v3: - Add extack messages. - Fix array overrun in veth_xmit. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 278 - 1 file changed, 188 insertions(+), 90 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 60397a8ea2e9..3059b897ecea 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -42,20 +42,24 @@ struct pcpu_vstats { struct u64_stats_sync syncp; }; -struct veth_priv { +struct veth_rq { struct napi_struct xdp_napi; struct net_device *dev; struct bpf_prog __rcu *xdp_prog; - struct bpf_prog *_xdp_prog; - struct net_device __rcu *peer; - atomic64_t dropped; struct xdp_mem_info xdp_mem; - unsignedrequested_headroom; boolrx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; }; +struct veth_priv { + struct net_device __rcu *peer; + atomic64_t dropped; + struct bpf_prog *_xdp_prog; + struct veth_rq *rq; + unsigned intrequested_headroom; +}; + /* * ethtool interface */ @@ -144,19 +148,19 @@ static void veth_ptr_free(void *ptr) kfree_skb(ptr); } -static void __veth_xdp_flush(struct veth_priv *priv) +static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!priv->rx_notify_masked) { - priv->rx_notify_masked = true; - napi_schedule(&priv->xdp_napi); + if (!rq->rx_notify_masked) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); } } -static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb) +static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { - if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) { + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { dev_kfree_skb_any(skb); return NET_RX_DROP; } @@ -164,21 +168,22 @@ static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb) return NET_RX_SUCCESS; } -static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp) +static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, + struct veth_rq *rq, bool xdp) { - struct veth_priv *priv = netdev_priv(dev); - return __dev_forward_skb(dev, skb) ?: xdp ? - veth_xdp_rx(priv, skb) : + veth_xdp_rx(rq, skb) : netif_rx(skb); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; bool rcv_xdp = false; + int rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -188,9 +193,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } rcv_priv = netdev_priv(rcv); - rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog); + rxq = skb_get_queue_mapping(skb); + if (rxq < rcv->real_num_rx_queues) { + rq = &rcv_priv->rq[rxq]; + rcv_xdp = rcu_access_pointer(rq->xdp_prog); + if (rcv_xdp) + skb_record_rx_queue(skb, rxq); + } - if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) { + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); u64_stats_update_begin(&stats->syncp); @@ -203,7 +214,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } if (rcv_xdp) - __veth_xdp_flush(rcv_priv); + __veth_xdp_flush(rq); rcu_read_unlock(); @@ -278,12 +289,18 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static int veth_select_rxq(struct net_device *dev) +{ + return smp_processor_id() % dev->real_num_rx_queues; +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct veth_p
[PATCH v5 bpf-next 6/9] bpf: Make redirect_info accessible from modules
From: Toshiaki Makita We are going to add kern_flags field in redirect_info for kernel internal use. In order to avoid function call to access the flags, make redirect_info accessible from modules. Also as it is now non-static, add prefix bpf_ to redirect_info. Signed-off-by: Toshiaki Makita --- include/linux/filter.h | 10 ++ net/core/filter.c | 29 +++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..4717af8b95e6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -537,6 +537,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) diff --git a/net/core/filter.c b/net/core/filter.c index 104d560946da..acf322296535 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2080,19 +2080,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2105,7 +2098,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3198,7 +3191,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3243,7 +3236,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3283,7 +3276,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3315,7 +3308,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3366,7 +3359,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3397,7 +3390,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3421,7 +3414,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
[PATCH v5 bpf-next 8/9] veth: Add XDP TX and REDIRECT
From: Toshiaki Makita This allows further redirection of xdp_frames like NIC -> veth--veth -> veth--veth (XDP) (XDP) (XDP) The intermediate XDP, redirecting packets from NIC to the other veth, reuses xdp_mem_info from NIC so that page recycling of the NIC works on the destination veth's XDP. In this way return_frame is not fully guarded by NAPI, since another NAPI handler on another cpu may use the same xdp_mem_info concurrently. Thus disable napi_direct by xdp_set_return_frame_no_direct() during the NAPI context. v4: - Use xdp_[set|clear]_return_frame_no_direct() instead of a flag in xdp_mem_info. v3: - Fix double free when veth_xdp_tx() returns a positive value. - Convert xdp_xmit and xdp_redir variables into flags. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 119 + 1 file changed, 110 insertions(+), 9 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index acdb1c543f4b..60397a8ea2e9 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -32,6 +32,10 @@ #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) +/* Separating two types of XDP xmit */ +#define VETH_XDP_TXBIT(0) +#define VETH_XDP_REDIR BIT(1) + struct pcpu_vstats { u64 packets; u64 bytes; @@ -45,6 +49,7 @@ struct veth_priv { struct bpf_prog *_xdp_prog; struct net_device __rcu *peer; atomic64_t dropped; + struct xdp_mem_info xdp_mem; unsignedrequested_headroom; boolrx_notify_masked; struct ptr_ring xdp_ring; @@ -317,10 +322,42 @@ static int veth_xdp_xmit(struct net_device *dev, int n, return n - drops; } +static void veth_xdp_flush(struct net_device *dev) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + + rcu_read_lock(); + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + goto out; + + rcv_priv = netdev_priv(rcv); + /* xdp_ring is initialized on receive side? */ + if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog))) + goto out; + + __veth_xdp_flush(rcv_priv); +out: + rcu_read_unlock(); +} + +static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return veth_xdp_xmit(dev, 1, &frame, 0); +} + static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, - struct xdp_frame *frame) + struct xdp_frame *frame, + unsigned int *xdp_xmit) { int len = frame->len, delta = 0; + struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; unsigned int headroom; struct sk_buff *skb; @@ -344,6 +381,29 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, delta = frame->data - xdp.data; len = xdp.data_end - xdp.data; break; + case XDP_TX: + orig_frame = *frame; + xdp.data_hard_start = frame; + xdp.rxq->mem = frame->mem; + if (unlikely(veth_xdp_tx(priv->dev, &xdp) < 0)) { + trace_xdp_exception(priv->dev, xdp_prog, act); + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + orig_frame = *frame; + xdp.data_hard_start = frame; + xdp.rxq->mem = frame->mem; + if (xdp_do_redirect(priv->dev, &xdp, xdp_prog)) { + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: @@ -368,12 +428,13 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, err_xdp: rcu_read_unlock(); xdp_return_frame(frame); - +xdp_xmit: return NULL; } static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, - struct sk_buff *skb) + struct sk_buff *skb, + unsigned int *xdp_xmit) {
[PATCH v5 bpf-next 5/9] veth: Add ndo_xdp_xmit
From: Toshiaki Makita This allows NIC's XDP to redirect packets to veth. The destination veth device enqueues redirected packets to the napi ring of its peer, then they are processed by XDP on its peer veth device. This can be thought as calling another XDP program by XDP program using REDIRECT, when the peer enables driver XDP. Note that when the peer veth device does not set driver xdp, redirected packets will be dropped because the peer is not ready for NAPI. v4: - Don't use xdp_ok_fwd_dev() because checking IFF_UP is not necessary. Add comments about it and check only MTU. v2: - Drop the part converting xdp_frame into skb when XDP is not enabled. - Implement bulk interface of ndo_xdp_xmit. - Implement XDP_XMIT_FLUSH bit and drop ndo_xdp_flush. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 51 +++ 1 file changed, 51 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ef22d991f678..acdb1c543f4b 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +126,11 @@ static void *veth_ptr_to_xdp(void *ptr) return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } +static void *veth_xdp_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); +} + static void veth_ptr_free(void *ptr) { if (veth_is_xdp_frame(ptr)) @@ -267,6 +273,50 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static int veth_xdp_xmit(struct net_device *dev, int n, +struct xdp_frame **frames, u32 flags) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + unsigned int max_len; + int i, drops = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + return -ENXIO; + + rcv_priv = netdev_priv(rcv); + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive +* side. This means an XDP program is loaded on the peer and the peer +* device is up. +*/ + if (!rcu_access_pointer(rcv_priv->xdp_prog)) + return -ENXIO; + + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; + + spin_lock(&rcv_priv->xdp_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *frame = frames[i]; + void *ptr = veth_xdp_to_ptr(frame); + + if (unlikely(frame->len > max_len || +__ptr_ring_produce(&rcv_priv->xdp_ring, ptr))) { + xdp_return_frame_rx_napi(frame); + drops++; + } + } + spin_unlock(&rcv_priv->xdp_ring.producer_lock); + + if (flags & XDP_XMIT_FLUSH) + __veth_xdp_flush(rcv_priv); + + return n - drops; +} + static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, struct xdp_frame *frame) { @@ -766,6 +816,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom= veth_set_rx_headroom, .ndo_bpf= veth_xdp, + .ndo_xdp_xmit = veth_xdp_xmit, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ -- 2.14.3
[PATCH v5 bpf-next 4/9] veth: Handle xdp_frames in xdp napi ring
From: Toshiaki Makita This is preparation for XDP TX and ndo_xdp_xmit. This allows napi handler to handle xdp_frames through xdp ring as well as sk_buff. v3: - Revert v2 change around rings and use a flag to differentiate skb and xdp_frame, since bulk skb xmit makes little performance difference for now. v2: - Use another ring instead of using flag to differentiate skb and xdp_frame. This approach makes bulk skb transmit possible in veth_xmit later. - Clear xdp_frame feilds in skb->head. - Implement adjust_tail. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 87 ++ 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 1b4006d3df32..ef22d991f678 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -22,12 +22,12 @@ #include #include #include -#include #include #define DRV_NAME "veth" #define DRV_VERSION"1.0" +#define VETH_XDP_FLAG BIT(0) #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) @@ -115,6 +115,24 @@ static const struct ethtool_ops veth_ethtool_ops = { /* general routines */ +static bool veth_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & VETH_XDP_FLAG; +} + +static void *veth_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); +} + +static void veth_ptr_free(void *ptr) +{ + if (veth_is_xdp_frame(ptr)) + xdp_return_frame(veth_ptr_to_xdp(ptr)); + else + kfree_skb(ptr); +} + static void __veth_xdp_flush(struct veth_priv *priv) { /* Write ptr_ring before reading rx_notify_masked */ @@ -249,6 +267,61 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, + struct xdp_frame *frame) +{ + int len = frame->len, delta = 0; + struct bpf_prog *xdp_prog; + unsigned int headroom; + struct sk_buff *skb; + + rcu_read_lock(); + xdp_prog = rcu_dereference(priv->xdp_prog); + if (likely(xdp_prog)) { + struct xdp_buff xdp; + u32 act; + + xdp.data_hard_start = frame->data - frame->headroom; + xdp.data = frame->data; + xdp.data_end = frame->data + frame->len; + xdp.data_meta = frame->data - frame->metasize; + xdp.rxq = &priv->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + delta = frame->data - xdp.data; + len = xdp.data_end - xdp.data; + break; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(priv->dev, xdp_prog, act); + case XDP_DROP: + goto err_xdp; + } + } + rcu_read_unlock(); + + headroom = frame->data - delta - (void *)frame; + skb = veth_build_skb(frame, headroom, len, 0); + if (!skb) { + xdp_return_frame(frame); + goto err; + } + + memset(frame, 0, sizeof(*frame)); + skb->protocol = eth_type_trans(skb, priv->dev); +err: + return skb; +err_xdp: + rcu_read_unlock(); + xdp_return_frame(frame); + + return NULL; +} + static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, struct sk_buff *skb) { @@ -358,12 +431,16 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget) int i, done = 0; for (i = 0; i < budget; i++) { - struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring); + void *ptr = __ptr_ring_consume(&priv->xdp_ring); + struct sk_buff *skb; - if (!skb) + if (!ptr) break; - skb = veth_xdp_rcv_skb(priv, skb); + if (veth_is_xdp_frame(ptr)) + skb = veth_xdp_rcv_one(priv, veth_ptr_to_xdp(ptr)); + else + skb = veth_xdp_rcv_skb(priv, ptr); if (skb) napi_gro_receive(&priv->xdp_napi, skb); @@ -416,7 +493,7 @@ static void veth_napi_del(struct net_device *dev) napi_disable(&priv->xdp_napi); netif_napi_del(&priv->xdp_napi); priv->rx_notify_masked = false; - ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb); + ptr_ring_cleanup(&priv->xdp_ring, veth_ptr_free); } static int veth_enable_xdp(struct net_device *dev) -- 2.14.3
[PATCH v5 bpf-next 7/9] xdp: Helpers for disabling napi_direct of xdp_return_frame
From: Toshiaki Makita We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. Signed-off-by: Toshiaki Makita --- include/linux/filter.h | 25 + net/core/xdp.c | 6 -- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 4717af8b95e6..2b072dab32c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,10 +543,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 57285383ed00..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- 2.14.3
[PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
From: Toshiaki Makita All oversized packets including GSO packets are dropped if XDP is enabled on receiver side, so don't send such packets from peer. Drop TSO and SCTP fragmentation features so that veth devices themselves segment packets with XDP enabled. Also cap MTU accordingly. v4: - Don't auto-adjust MTU but cap max MTU. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 47 +-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 78fa08cb6e24..1b4006d3df32 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -542,6 +542,23 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static netdev_features_t veth_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer) { + struct veth_priv *peer_priv = netdev_priv(peer); + + if (peer_priv->_xdp_prog) + features &= ~NETIF_F_GSO_SOFTWARE; + } + + return features; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -571,6 +588,7 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct veth_priv *priv = netdev_priv(dev); struct bpf_prog *old_prog; struct net_device *peer; + unsigned int max_mtu; int err; old_prog = priv->_xdp_prog; @@ -584,6 +602,15 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, goto err; } + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - + peer->hard_header_len - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (peer->mtu > max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); + err = -ERANGE; + goto err; + } + if (dev->flags & IFF_UP) { err = veth_enable_xdp(dev); if (err) { @@ -591,14 +618,29 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, goto err; } } + + if (!old_prog) { + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; + peer->max_mtu = max_mtu; + } } if (old_prog) { - if (!prog && dev->flags & IFF_UP) - veth_disable_xdp(dev); + if (!prog) { + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); + + if (peer) { + peer->hw_features |= NETIF_F_GSO_SOFTWARE; + peer->max_mtu = ETH_MAX_MTU; + } + } bpf_prog_put(old_prog); } + if ((!!old_prog ^ !!prog) && peer) + netdev_update_features(peer); + return 0; err: priv->_xdp_prog = old_prog; @@ -643,6 +685,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_poll_controller= veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, + .ndo_fix_features = veth_fix_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom= veth_set_rx_headroom, .ndo_bpf= veth_xdp, -- 2.14.3