Re: pls merge net to net-next
From: Stephen Hemminger
Date: Tue, 16 Aug 2016 14:02:18 -0700 > Since Vitaly's changes to hyperv drivers went into net it makes > it harder to submit other changes for net-next since they will invariably > cause conflicts. Done.
[PATCH 2/2] pptp: Reset call_id as 0 to avoid one useless lookup at next time
From: Gao FengWhen pptp fails to get valid callid, the global call_id is set as MAX_CALLID. Then it must fail to get callid at next time, when invoke find_next_zero_bit from call_id+1. Because the call_id+1 exceeds the limit "MAX_CALLID". So reset call_id as 0 when fail to get valid callid. And add one variable to check if need the second lookup. Signed-off-by: Gao Feng --- v1: Initial patch drivers/net/ppp/pptp.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 48c3701..9afef3c 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -103,12 +103,21 @@ static int add_chan(struct pppox_sock *sock, static int call_id; spin_lock(_lock); - if (!sa->call_id) { + if (!sa->call_id) { + bool from_start = (call_id == 0); + call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1); if (call_id == MAX_CALLID) { - call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1); - if (call_id == MAX_CALLID) + if (unlikely(from_start)) { + call_id = 0; goto out_err; + } else { + call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1); + if (call_id == MAX_CALLID) { + call_id = 0; + goto out_err; + } + } } sa->call_id = call_id; } else if (test_bit(sa->call_id, callid_bitmap)) { @@ -656,8 +665,10 @@ static int __init pptp_init_module(void) pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n"); callid_sock = vzalloc((MAX_CALLID + 1) * sizeof(void *)); - if (!callid_sock) + if (!callid_sock) { + pr_err("PPTP: can't alloc callid_sock mem"); return -ENOMEM; + } err = gre_add_protocol(_pptp_protocol, GREPROTO_PPTP); if (err) { -- 1.9.1
[PATCH v1 1/2] pptp: Use macro and sizeof instead of literal number
From: Gao FengUse existing macros like PPP_ADDRESS, SC_COMP_PROT and sizeof fixed variables instead of original literal number to enhance readbility. BTW, the original pptp_rcv uses literal number "12" as the param of pskb_may_pull. Actually the "12" is less than the size of struct pptp_gre_header. Now use the sizeof(*header) fixes this issue. Signed-off-by: Gao Feng --- v1: Initial patch drivers/net/ppp/pptp.c | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 1951b10..48c3701 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -54,6 +54,8 @@ static struct proto pptp_sk_proto __read_mostly; static const struct ppp_channel_ops pptp_chan_ops; static const struct proto_ops pptp_ops; +static const u8 fixed_ppphdr[2] = {PPP_ALLSTATIONS, PPP_UI}; + static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) { struct pppox_sock *sock; @@ -167,7 +169,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) tdev = rt->dst.dev; - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2; + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + sizeof(fixed_ppphdr); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); @@ -190,9 +192,9 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) /* Put in the address/control bytes if necessary */ if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) { - data = skb_push(skb, 2); - data[0] = PPP_ALLSTATIONS; - data[1] = PPP_UI; + data = skb_push(skb, sizeof(fixed_ppphdr)); + data[0] = fixed_ppphdr[0]; + data[1] = fixed_ppphdr[1]; } len = skb->len; @@ -219,8 +221,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) } hdr->payload_len = htons(len); - /* Push down and install the IP header. */ - + /* Push down and install the IP header. 
*/ skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); @@ -319,14 +320,14 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) allow_packet: skb_pull(skb, headersize); - if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) { + if (PPP_ADDRESS(payload) == PPP_ALLSTATIONS && PPP_CONTROL(payload) == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto drop; - skb_pull(skb, 2); + skb_pull(skb, sizeof(fixed_ppphdr)); } - if ((*skb->data) & 1) { + if ((*skb->data) & SC_COMP_PROT) { /* protocol is compressed */ skb_push(skb, 1)[0] = 0; } @@ -351,7 +352,7 @@ static int pptp_rcv(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; - if (!pskb_may_pull(skb, 12)) + if (!pskb_may_pull(skb, sizeof(*header))) goto drop; iph = ip_hdr(skb); @@ -468,7 +469,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, ip_rt_put(rt); po->chan.mtu -= PPTP_HEADER_OVERHEAD; - po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header); + po->chan.hdrlen = sizeof(fixed_ppphdr) + sizeof(struct pptp_gre_header); error = ppp_register_channel(>chan); if (error) { pr_err("PPTP: failed to register PPP channel (%d)\n", error); -- 1.9.1
[PATCH] fib_trie: Fix the description of pos and bits
1) Fix one typo: s/tn/tp/ 2) Fix the description about the "u" bits. Signed-off-by: Xunlei Pang--- net/ipv4/fib_trie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d07fc07..eb7c5d1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -249,7 +249,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv) * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * - * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits + * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest @@ -258,7 +258,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv) * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * - * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown + * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ -- 1.8.3.1
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
Florian Fainelli wrote: The larger issue is that the emac_sgmii node in the form you posted is going to be backed by a platform device in Linux while you want a PHY device with a reg property that describes a MDIO address (#address-cells = 1, #size-cells = 0). But how do I get the platform device for the emac_sgmii node? If I create an of_device_id ID for it, like this: static const struct of_device_id emac_dt_match[] = { { .compatible = "qcom,fsm9900-emac", }, {} }; static const struct of_device_id emac_sgmii_dt_match[] = { { .compatible = "qcom,fsm9900-emac-sgmii", }, {} }; Then the probe function will be called for qcom,fsm9900-emac-sgmii separately from qcom,fsm9900-emac, which just confuses things. So I can't create emac_sgmii_dt_match. I know this is standard DT stuff, and I used to do a lot of work on DT so maybe I should know this already. But it seems to me that I need to manually create the platform_device for qcom,fsm9900-emac-sgmii. IIRC the amd xgbe driver mainline had a similar design but still implemented a PHY device anyway although it may not have been using Device Tree. It should still be possible to implement a PHY driver that you manually register and bind to its device_node pointer such that of_phy_find_device and friends still work. You would do this from the emac_sgmii platform device driver and parent devices in a way that satisfy the PHY device driver lifecycle as well. Hope this helps. It doesn't, sorry. The emac_sgmii is really just another register block for the driver to program. Creating another PHY driver for it doesn't really make sense. It's not on an MDIO bus. -- Sent by an employee of the Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation.
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
On August 17, 2016 8:27:19 PM PDT, Timur Tabiwrote: >Florian Fainelli wrote: > >>> emac_sgmii: ethernet-phy@410400 { >>> compatible = "qcom,qdf2432-emac-phy"; >>> reg = <0x0 0x00410400 0x0 0x100>; >>> interrupts = <0 0x104 0>; >>> }; >>> >> >> Is this register range relative to the emac0 node here, or is this >> really a separate node, within the same adress space as your emac0 >node? > >It's a separate node within the same address space. We can't guarantee > >that it's adjacent to any other EMAC register -- it could be anywhere >in >memory. 0x00410400 is the real physical address of those registers. Ok > >> Answer largely depends on whether your device is really located >outside >> of the emac, if it located outside, then a platform device matching >the >> compatible string would get you what you want. If the emac_sgmii >block >> is a sub-block within the EMAC, then a few things need fixing: >> >> - your emac_sgmii node should be a sub-node of the emac node, not a >sibling >> - the emac0 node should have a "ranges" property that indicates how >to >> translate the sub-nodes' "reg" property based on the base register >> address of the emac0 block >> - you would have to call of_platform_populate from the EMAC driver to >> ensure that the emac_sgmii child node and therefore platform device >gets >> created > >Even if the emac_sgmii block were a sub-block, wouldn't it conflict >with >the ethernet-phy@4 node? The #address-cells and #size-cells properties > >cannot be valid for both the emac_sgmii and ethernet-phy@4 nodes. These two properties apply to subnodes within the emac0 and emac_sgmii nodes, this is standard DT stuff here. The larger issue is that the emac_sgmii node in the form you posted is going to be backed by a platform device in Linux while you want a PHY device with a reg property that describes a MDIO address (#address-cells = 1, #size-cells = 0). 
IIRC the amd xgbe driver mainline had a similar design but still implemented a PHY device anyway although it may not have been using Device Tree. It should still be possible to implement a PHY driver that you manually register and bind to its device_node pointer such that of_phy_find_device and friends still work. You would do this from the emac_sgmii platform device driver and parent devices in a way that satisfy the PHY device driver lifecycle as well. Hope this helps. NB: another way to do this is to give the SGMII PHY device a MDIO address, provided that it has one, and you pass the memory mapped register range as a syscon property pointing to the emac_sgmii register range. This is not really clean through. -- Florian
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
Florian Fainelli wrote: emac_sgmii: ethernet-phy@410400 { compatible = "qcom,qdf2432-emac-phy"; reg = <0x0 0x00410400 0x0 0x100>; interrupts = <0 0x104 0>; }; Is this register range relative to the emac0 node here, or is this really a separate node, within the same adress space as your emac0 node? It's a separate node within the same address space. We can't guarantee that it's adjacent to any other EMAC register -- it could be anywhere in memory. 0x00410400 is the real physical address of those registers. Answer largely depends on whether your device is really located outside of the emac, if it located outside, then a platform device matching the compatible string would get you what you want. If the emac_sgmii block is a sub-block within the EMAC, then a few things need fixing: - your emac_sgmii node should be a sub-node of the emac node, not a sibling - the emac0 node should have a "ranges" property that indicates how to translate the sub-nodes' "reg" property based on the base register address of the emac0 block - you would have to call of_platform_populate from the EMAC driver to ensure that the emac_sgmii child node and therefore platform device gets created Even if the emac_sgmii block were a sub-block, wouldn't it conflict with the ethernet-phy@4 node? The #address-cells and #size-cells properties cannot be valid for both the emac_sgmii and ethernet-phy@4 nodes. -- Sent by an employee of the Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation.
Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO
On 8/17/16 7:06 PM, Alexander Duyck wrote: > On Wed, Aug 17, 2016 at 4:23 PM, David Ahernwrote: >> On 8/17/16 5:16 PM, Alexander Duyck wrote: diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1ecbd7715f6d..6d78f162a88b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb->mac_len); skb_reset_mac_header(skb); + /* for GSO: set MPLS as network header and encapsulated protocol +* header as inner network header +*/ + skb_set_network_header(skb, skb->mac_len); + skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN); + new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; >>> >>> So the one question I would have about this is how attached are you to >>> using the network_header to record the offset for the MPLS header? I >>> ask because I think from a hardware offloading perspective it would >>> make it much easier if instead you used the inner_mac_header to >>> represent the offset for the MPLS header. This way device drivers >>> could just skip over it like a VLAN and just use network and transport >>> header values like they would otherwise. >>> >> >> Where does the network_header relate to if I change the marker to >> inner_mac_header? Would it be skipped? > > No, the network header would still be the network header. If core MPLS code (ie., non-OVS) does not do skb_reset_network_header(skb) after adding the MPLS label nothing works. Not even ping with small packets. tcpdump shows a completely mangled packet. Right now resetting the network_header to mpls is required.
[PATCH v1 1/1] pppoe: l2tp: the PPPOX_CONNECTED should be used with bit operation
From: Gao FengThere are some codes in pppoe and l2tp which use the PPPOX_CONNECTED as the value including assignment and condition check. They should keep consistent with other codes. Signed-off-by: Gao Feng --- v1: Initial Patch drivers/net/ppp/pppoe.c | 2 +- net/l2tp/l2tp_ppp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4ddae81..684b773 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -697,7 +697,7 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, goto err_put; } - sk->sk_state = PPPOX_CONNECTED; + sk->sk_state |= PPPOX_CONNECTED; } po->num = sp->sa_addr.pppoe.sid; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d9560aa..3984385 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -774,7 +774,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, out_no_ppp: /* This is how we get the session context from the socket. */ sk->sk_user_data = session; - sk->sk_state = PPPOX_CONNECTED; + sk->sk_state |= PPPOX_CONNECTED; l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n", session->name); @@ -856,7 +856,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, error = -ENOTCONN; if (sk == NULL) goto end; - if (sk->sk_state != PPPOX_CONNECTED) + if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; -- 1.9.1
Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() to not drop all packets
On Wed, Aug 17, 2016 at 5:17 PM, Eric Dumazetwrote: > From: Eric Dumazet > > Over the years, TCP BDP has increased a lot, and is typically > in the order of ~10 Mbytes with help of clever Congestion Control > modules. > > In presence of packet losses, TCP stores incoming packets into an out of > order queue, and number of skbs sitting there waiting for the missing > packets to be received can match the BDP (~10 Mbytes) > > In some cases, TCP needs to make room for incoming skbs, and current > strategy can simply remove all skbs in the out of order queue as a last > resort, incurring a huge penalty, both for receiver and sender. > > Unfortunately these 'last resort events' are quite frequent, forcing > sender to send all packets again, stalling the flow and wasting a lot of > resources. > > This patch cleans only a part of the out of order queue in order > to meet the memory constraints. > > Signed-off-by: Eric Dumazet > Cc: Neal Cardwell > Cc: Yuchung Cheng > Cc: Soheil Hassas Yeganeh > Cc: C. Stephen Gun > Cc: Van Jacobson Acked-by: Soheil Hassas Yeganeh > --- > net/ipv4/tcp_input.c | 47 - > 1 file changed, 28 insertions(+), 19 deletions(-) > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index > 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 > 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, > struct sk_buff *skb, > if (tcp_prune_queue(sk) < 0) > return -1; > > - if (!sk_rmem_schedule(sk, skb, size)) { > + while (!sk_rmem_schedule(sk, skb, size)) { > if (!tcp_prune_ofo_queue(sk)) > return -1; > - > - if (!sk_rmem_schedule(sk, skb, size)) > - return -1; > } > } > return 0; > @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk) > } > > /* > - * Purge the out-of-order queue. > - * Return true if queue was pruned. > + * Clean the out-of-order queue to make room. 
> + * We drop high sequences packets to : > + * 1) Let a chance for holes to be filled. > + * 2) not add too big latencies if thousands of packets sit there. > + *(But if application shrinks SO_RCVBUF, we could still end up > + * freeing whole queue here) > + * > + * Return true if queue has shrunk. > */ > static bool tcp_prune_ofo_queue(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); > - bool res = false; > + struct sk_buff *skb; > > - if (!skb_queue_empty(>out_of_order_queue)) { > - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); > - __skb_queue_purge(>out_of_order_queue); > + if (skb_queue_empty(>out_of_order_queue)) > + return false; > > - /* Reset SACK state. A conforming SACK implementation will > -* do the same at a timeout based retransmit. When a > connection > -* is in a sad state like this, we care only about integrity > -* of the connection not performance. > -*/ > - if (tp->rx_opt.sack_ok) > - tcp_sack_reset(>rx_opt); > + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); > + > + while ((skb = __skb_dequeue_tail(>out_of_order_queue)) != NULL) { > + tcp_drop(sk, skb); > sk_mem_reclaim(sk); > - res = true; > + if (atomic_read(>sk_rmem_alloc) <= sk->sk_rcvbuf && > + !tcp_under_memory_pressure(sk)) > + break; > } > - return res; > + > + /* Reset SACK state. A conforming SACK implementation will > +* do the same at a timeout based retransmit. When a connection > +* is in a sad state like this, we care only about integrity > +* of the connection not performance. > +*/ > + if (tp->rx_opt.sack_ok) > + tcp_sack_reset(>rx_opt); > + return true; > } > > /* Reduce allocated memory if we can, trying to get > > Very nice patch, Eric! Thanks.
Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO
On Wed, Aug 17, 2016 at 4:23 PM, David Ahernwrote: > On 8/17/16 5:16 PM, Alexander Duyck wrote: >>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c >>> index 1ecbd7715f6d..6d78f162a88b 100644 >>> --- a/net/openvswitch/actions.c >>> +++ b/net/openvswitch/actions.c >>> @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct >>> sw_flow_key *key, >>> skb->mac_len); >>> skb_reset_mac_header(skb); >>> >>> + /* for GSO: set MPLS as network header and encapsulated protocol >>> +* header as inner network header >>> +*/ >>> + skb_set_network_header(skb, skb->mac_len); >>> + skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN); >>> + >>> new_mpls_lse = (__be32 *)skb_mpls_header(skb); >>> *new_mpls_lse = mpls->mpls_lse; >>> >> >> So the one question I would have about this is how attached are you to >> using the network_header to record the offset for the MPLS header? I >> ask because I think from a hardware offloading perspective it would >> make it much easier if instead you used the inner_mac_header to >> represent the offset for the MPLS header. This way device drivers >> could just skip over it like a VLAN and just use network and transport >> header values like they would otherwise. >> > > Where does the network_header relate to if I change the marker to > inner_mac_header? Would it be skipped? No, the network header would still be the network header. > skb->protocol is set to MPLS. > mac_header points to ethernet address > network_header points to ??? The network_header would point to the IP header like it would be for a non-MPLS frame. > inner protocol is set to what is encapsulated (e.g., ipv4 or ipv6) I am okay with this, but wonder if we actually need it. Do you know of any protocols other than IPv4 or IPv6 that can be carried over MPLS and would expect to be offloaded? 
If not we may be able to just get away with recording the network header offset and then using the first nibble of the network header to determine the IP version since the value should be 4 or 6 for the two types we are offloading. > inner_mac_header points to start of mpls label. So this is what I would expect. > inner_network points to start of network header. The problem is that using inner_network_header to point to the network header will require me to fork the path pretty significantly for most of the Intel devices that would want to do MPLS GSO. The assumption most drivers make is that if we are offloading things then network_header and inner_network_header will point to either IPv4 or IPv6 headers. Introducing MPLS as the network_header with IPv4 or IPv6 as the inner_network_header throws a kink in the works because we currently ignore inner_network_header for the devices that are doing UDP or GRE tunnel GSO via GSO_PARTIAL with TSO_MANGLEID. > Is that sufficient for h/w drivers? I think of this as working like how we handle it for IP over IP tunnels. In that case we are at L3 so the inner_network_header field is populated, but the transport header stays the same. In the case of MPLS it isn't really L3 it is more of an L2.5 so my preference would be to treat it like it is an L2 tunnel or VLAN and just overwrite the inner_mac_header with the MPLS header offset, and leave the network and transport headers untouched. One other bonus that also occurred to me is that you might be able to get away with doing MPLS offloads for MPLS over IP or GRE tunnels. I hadn't realized that MPLS inside of these tunnels was a thing, I had just noticed it while looking over how the IP-in-IP tunnels are all being handled. However if you move the header tracking to inner_mac_header, and can avoid using skb->inner_protocol by instead using the first nibble of the network_header value then you could probably support segmenting those types of tunnels in hardware. - Alex
Re: [PATCH v2 1/1] ppp: Fix one deadlock issue of PPP when send frame
inline answer. On Thu, Aug 18, 2016 at 1:42 AM, Guillaume Naultwrote: > On Tue, Aug 16, 2016 at 07:33:38PM +0800, f...@ikuai8.com wrote: >> From: Gao Feng >> >> PPP channel holds one spinlock before send frame. But the skb may >> select the same PPP channel with wrong route policy. As a result, >> the skb reaches the same channel path. It tries to get the same >> spinlock which is held before. Bang, the deadlock comes out. >> > Unless I misunderstood the problem you're trying to solve, this patch > doesn't really help: deadlock still occurs if the same IP is used for > L2TP and PPP's peer address. > The deadlock happens because the same cpu try to hold the spinlock which is already held before by itself. Now the PPP_CHANNEL_LOCK_BH sets the lock owner after hold lock, then when the same cpu invokes PPP_CHANNEL_LOCK_BH again. The cl.owner equals current cpu id, so it only increases the lock_cnt without trying to hold the lock again. So it avoids the deadlock. >> Now add one lock owner to avoid it like xmit_lock_owner of >> netdev_queue. Check the lock owner before try to get the spinlock. >> If the current cpu is already the owner, needn't lock again. When >> PPP channel holds the spinlock at the first time, it sets owner >> with current CPU ID. >> > I think you should forbid lock recursion entirely, and drop the packet > if the owner tries to re-acquire the channel lock. Otherwise you just > move the deadlock down the stack (l2tp_xmit_skb() can't be called > recursively). The reason that fix it in ppp is that there are other layer on the ppp module. We resolve it in ppp module, it could avoid all similar potential issues. 
> >> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c >> index 70cfa06..6909ab1 100644 >> --- a/drivers/net/ppp/ppp_generic.c >> +++ b/drivers/net/ppp/ppp_generic.c >> @@ -162,6 +162,37 @@ struct ppp { >>|SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \ >>|SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP) >> >> +struct chennel_lock { > ^ > I guess you meant 'channel_lock'. Yes. It is a typo. Thanks. > >> + spinlock_t lock; >> + u32 owner; > This field's default value is -1. So why not declaring it as 'int'? OK. I will fix it. Best Regards Feng -- To unsubscribe from this list: send the line "unsubscribe linux-ppp" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT] Networking
1) Buffers powersave frame test is reversed in cfg80211, fix from Felix Fietkau. 2) Remove bogus WARN_ON in openvswitch, from Jarno Rajahalme. 3) Fix some tg3 ethtool logic bugs, and one that would cause no interrupts to be generated when rx-coalescing is set to 0. From Satish Baddipadige and Siva Reddy Kallam. 4) QLCNIC mailbox corruption and napi budget handling fix from Manish Chopra. 5) Fix fib_trie logic when walking the trie during /proc/net/route output than can access a stale node pointer. From David Forster. 6) Several sctp_diag fixes from Phil Sutter. 7) PAUSE frame handling fixes in mlxsw driver from Ido Schimmel. 8) Checksum fixup fixes in bpf from Daniel Borkmann. 9) Memork leaks in nfnetlink, from Liping Zhang. 10) Use after free in rxrpc, from David Howells. 11) Use after free in new skb_array code of macvtap driver, from Jason Wang. 12) Calipso resource leak, from Colin Ian King. 13) mediatek bug fixes (missing stats sync init, etc.) from Sean Wang. 14) Fix bpf non-linear packet write helpers, from Daniel Borkmann. 15) Fix lockdep splats in macsec, from Sabrina Dubroca. 16) hv_netvsc bug fixes from Vitaly Kuznetsov, mostly to do with VF handling. 17) Various tc-action bug fixes, from CONG Wang. Please pull, thanks a lot! 
The following changes since commit bf0f500bd0199aab613eb0ecb3412edd5472740d: Merge tag 'trace-v4.8-1' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace (2016-08-03 12:50:06 -0400) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git for you to fetch changes up to b96c22c071eb1126db4055de4bb75b02b05affd1: Merge branch 'tc_action-fixes' (2016-08-17 19:27:58 -0400) Alexander Duyck (2): ixgbe: Force VLNCTRL.VFE to be set in all VMDq paths ixgbe: Re-enable ability to toggle VLAN filtering Alexei Starovoitov (2): bpf: restore behavior of bpf_map_update_elem samples/bpf: add bpf_map_update_elem() tests Arnd Bergmann (3): net: xgene: fix maybe-uninitialized variable rxrpc: fix uninitialized pointer dereference in debug code dsa: mv88e6xxx: hide unused functions Christophe Leroy (1): netfilter: nf_conntrack_sip: CSeq 0 is a valid CSeq Colin Ian King (2): cfg80211: fix missing break in NL8211_CHAN_WIDTH_80P80 case calipso: fix resource leak on calipso_genopt failure Daniel Borkmann (5): bpf: also call skb_postpush_rcsum on xmit occasions bpf: fix checksum fixups on bpf_skb_store_bytes bpf: fix checksum for vlan push/pop helper bpf: fix bpf_skb_in_cgroup helper naming bpf: fix write helpers with regards to non-linear parts Dave Ertman (1): i40e: check for and deal with non-contiguous TCs David Forster (1): ipv4: panic in leaf_walk_rcu due to stale node pointer David Howells (6): rxrpc: Fix races between skb free, ACK generation and replying rxrpc: Need to flag call as being released on connect failure rxrpc: Don't access connection from call if pointer is NULL rxrpc: Once packet posted in data_ready, don't retry posting rxrpc: Fix a use-after-push in data_ready handler rxrpc: Free packets discarded in data_ready David S. 
Miller (14): Merge branch 'tg3-fixes' Merge branch 'qlcnic-fixes' Merge tag 'mac80211-for-davem-2016-08-05' of git://git.kernel.org/.../jberg/mac80211 Merge branch 'sctp_diag-fixes' Merge branch 'mlxsw-dcb-fixes' Merge branch 'bpf-csum-complete' Merge branch 'qed-fixes' Merge tag 'rxrpc-fixes-20160809' of git://git.kernel.org/.../dhowells/linux-fs Merge git://git.kernel.org/.../pablo/nf Merge branch 'hv_netvsc-VF-removal-fixes' Merge branch 'mediatek-fixes' Merge branch 'mlxsw-fixes' Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue Merge branch 'tc_action-fixes' Elad Raz (1): mlxsw: spectrum: Add missing packet traps Fabian Frederick (1): net: hns: fix typo in g_gmac_stats_string[] Felix Fietkau (2): mac80211: fix check for buffered powersave frames with txq mac80211: fix purging multicast PS buffer queue Florian Westphal (1): rhashtable: avoid large lock-array allocations Geert Uytterhoeven (1): net: dsa: b53: Add missing ULL suffix for 64-bit constant Grygorii Strashko (1): drivers: net: cpsw: fix kmemleak false-positive reports for sk buffers Harini Katakam (1): net: macb: Correct CAPS mask Ian Wienand (1): OVS: Ignore negative headroom value Ido Schimmel (11): mlxsw: spectrum: Do not assume PAUSE frames are disabled mlxsw: spectrum: Do not override PAUSE settings mlxsw: spectrum: Add missing DCB rollback in error path mlxsw: spectrum: Don't return upon error in removal path mlxsw: spectrum: Remove redundant errors from the code mlxsw:
Re: [PATCH] net: ipconfig: Fix more use after free
From: Thierry Reding
Date: Tue, 16 Aug 2016 16:45:38 +0200 > From: Thierry Reding > > While commit 9c706a49d660 ("net: ipconfig: fix use after free") avoids > the use after free, the resulting code still ends up calling both the > ic_setup_if() and ic_setup_routes() after calling ic_close_devs(), and > access to the device is still required. > > Move the call to ic_close_devs() to the very end of the function. > > Signed-off-by: Thierry Reding > --- > This applies on top of next-20160816. Applied, thanks.
Re: [PATCH v4 net-next 0/3] strp: Stream parser for messages
From: Tom Herbert
Date: Mon, 15 Aug 2016 14:51:00 -0700 > This patch set introduces a utility for parsing application layer > protocol messages in a TCP stream. This is a generalization of the > mechanism implemented of Kernel Connection Multiplexor. ... Series applied, thanks Tom.
Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule
On Wed, 2016-08-17 at 16:17 -0700, John Fastabend wrote: > On 16-08-17 04:01 PM, Eric Dumazet wrote: > > On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote: > > > >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > >> index d10b762..f5b7254 100644 > >> --- a/net/sched/sch_generic.c > >> +++ b/net/sched/sch_generic.c > >> @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, > >>if (qdisc_is_percpu_stats(q)) { > >>qdisc_qstats_cpu_backlog_inc(q, nskb); > >>qdisc_qstats_cpu_qlen_inc(q); > >> + set_thread_flag(TIF_NEED_RESCHED); > >>} else { > >>qdisc_qstats_backlog_inc(q, nskb); > >>q->q.qlen++; > > > > Hmm... care to elaborate this bit ? > > > > > > > > ah dang thats leftover from trying to resolve a skb getting stuck on the > bad_txq_cell from qdisc_enqueue_skb_bad_txq(). You'll notice I added > a __netif_schedule(skb) call in qdisc_enqueue_skb_bad_txq() which > resolves this and the set_thread_flag() here can then just be removed. OK I feel much better now ;) Thanks !
Re: [PATCH] xfrm: Only add l3mdev oif to dst lookups
From: David AhernDate: Sun, 14 Aug 2016 19:52:56 -0700 > Subash reported that commit 42a7b32b73d6 ("xfrm: Add oif to dst lookups") > broke a wifi use case that uses fib rules and xfrms. The intent of > 42a7b32b73d6 was driven by VRFs with IPsec. As a compromise relax the > use of oif in xfrm lookups to L3 master devices only (ie., oif is either > an L3 master device or is enslaved to a master device). > > Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") > Reported-by: Subash Abhinov Kasiviswanathan > Signed-off-by: David Ahern Steffen, please pick this up. Thank you.
Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO
On 8/17/16 5:16 PM, Alexander Duyck wrote: >> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c >> index 1ecbd7715f6d..6d78f162a88b 100644 >> --- a/net/openvswitch/actions.c >> +++ b/net/openvswitch/actions.c >> @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct >> sw_flow_key *key, >> skb->mac_len); >> skb_reset_mac_header(skb); >> >> + /* for GSO: set MPLS as network header and encapsulated protocol >> +* header as inner network header >> +*/ >> + skb_set_network_header(skb, skb->mac_len); >> + skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN); >> + >> new_mpls_lse = (__be32 *)skb_mpls_header(skb); >> *new_mpls_lse = mpls->mpls_lse; >> > > So the one question I would have about this is how attached are you to > using the network_header to record the offset for the MPLS header? I > ask because I think from a hardware offloading perspective it would > make it much easier if instead you used the inner_mac_header to > represent the offset for the MPLS header. This way device drivers > could just skip over it like a VLAN and just use network and transport > header values like they would otherwise. > Where does the network_header relate to if I change the marker to inner_mac_header? Would it be skipped? skb->protocol is set to MPLS. mac_header points to ethernet address network_header points to ??? inner protocol is set to what is encapsulated (e.g., ipv4 or ipv6) inner_mac_header points to start of mpls label. inner_network points to start of network header. Is that sufficient for h/w drivers?
Re: [PATCH 00/16] pull request for net-next: batman-adv 2016-08-16
From: Simon Wunderlich
Date: Tue, 16 Aug 2016 10:32:28 +0200 > this is our second feature pull request for batman-adv in this round, with > our joint work on netlink and netns support. There will be at least one more > pull request coming later. > > Please pull or let me know of any problem! Pulled, thanks Simon.
Re: [net 0/6][pull request] Intel Wired LAN Driver Updates 2016-08-16
From: Jeff Kirsher
Date: Tue, 16 Aug 2016 13:40:08 -0700 > This series contains fixes to e1000e, igb, ixgbe and i40e. Pulled, thanks a lot Jeff.
Re: [patch net v2 00/10] mlxsw: IPv4 UC router fixes
From: Jiri PirkoDate: Wed, 17 Aug 2016 16:39:27 +0200 > Ido says: > Patches 1-3 fix a long standing problem in the driver's init sequence, > which manifests itself quite often when routing daemons try to configure > an IP address on registered netdevs that don't yet have an associated > vPort. > > Patches 4-9 add missing packet traps for the router to work properly and > also fix ordering issue following the recent changes to the driver's init > sequence. > > The last patch isn't related to the router, but fixes a general problem > in which under certain conditions packets aren't trapped to CPU. > > v1->v2: > - Change order of patch 7 > - Add patch 6 following Ilan's comment > - Add patchset name and cover letter Series applied, thanks.
Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
On 16-08-17 04:04 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:38 -0700, John Fastabend wrote: >> The sch_mq qdisc creates a sub-qdisc per tx queue which are then >> called independently for enqueue and dequeue operations. However >> statistics are aggregated and pushed up to the "master" qdisc. >> >> This patch adds support for any of the sub-qdiscs to be per cpu >> statistic qdiscs. To handle this case add a check when calculating >> stats and aggregate the per cpu stats if needed. >> >> Also exports __gnet_stats_copy_queue() to use as a helper function. > > > Looks like this patch should be happening earlier in the series ? > > hmm yep patches 12 and 13 should come before 11 to avoid introducing a bug and subsequently fixing them.
Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule
On 16-08-17 04:01 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote: > >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c >> index d10b762..f5b7254 100644 >> --- a/net/sched/sch_generic.c >> +++ b/net/sched/sch_generic.c >> @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, >> if (qdisc_is_percpu_stats(q)) { >> qdisc_qstats_cpu_backlog_inc(q, nskb); >> qdisc_qstats_cpu_qlen_inc(q); >> +set_thread_flag(TIF_NEED_RESCHED); >> } else { >> qdisc_qstats_backlog_inc(q, nskb); >> q->q.qlen++; > > Hmm... care to elaborate this bit ? > > > ah dang thats leftover from trying to resolve a skb getting stuck on the bad_txq_cell from qdisc_enqueue_skb_bad_txq(). You'll notice I added a __netif_schedule(skb) call in qdisc_enqueue_skb_bad_txq() which resolves this and the set_thread_flag() here can then just be removed. .John
Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO
On Wed, Aug 17, 2016 at 2:49 PM, David Ahernwrote: > As reported by Lennert the MPLS GSO code is failing to properly segment > large packets. There are a couple of problems: > > 1. the inner protocol is not set so the gso segment functions for inner >protocol layers are not getting run, and > > 2 MPLS labels for packets that use the "native" (non-OVS) MPLS code >are not properly accounted for in mpls_gso_segment. > > The MPLS GSO code was added for OVS. It is re-using skb_mac_gso_segment > to call the gso segment functions for the higher layer protocols. That > means skb_mac_gso_segment is called twice -- once with the network > protocol set to MPLS and again with the network protocol set to the > inner protocol. > > This patch sets the inner skb protocol addressing item 1 above and sets > the network_header and inner_network_header to mark where the MPLS labels > start and end. The MPLS code in OVS is also updated to set the two > network markers. > > From there the MPLS GSO code uses the difference between the network > header and the inner network header to know the size of the MPLS header > that was pushed. It then pulls the MPLS header, resets the mac_len and > protocol for the inner protocol and then calls skb_mac_gso_segment > to segment the skb. Afterwards the skb protocol is set to mpls for > each segment as suggested by Simon. 
> > Reported-by: Lennert Buytenhek > Signed-off-by: David Ahern > --- > net/mpls/mpls_gso.c | 24 +--- > net/mpls/mpls_iptunnel.c | 5 + > net/openvswitch/actions.c | 6 ++ > 3 files changed, 24 insertions(+), 11 deletions(-) > > diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c > index 1ecbd7715f6d..6d78f162a88b 100644 > --- a/net/openvswitch/actions.c > +++ b/net/openvswitch/actions.c > @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct > sw_flow_key *key, > skb->mac_len); > skb_reset_mac_header(skb); > > + /* for GSO: set MPLS as network header and encapsulated protocol > +* header as inner network header > +*/ > + skb_set_network_header(skb, skb->mac_len); > + skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN); > + > new_mpls_lse = (__be32 *)skb_mpls_header(skb); > *new_mpls_lse = mpls->mpls_lse; > So the one question I would have about this is how attached are you to using the network_header to record the offset for the MPLS header? I ask because I think from a hardware offloading perspective it would make it much easier if instead you used the inner_mac_header to represent the offset for the MPLS header. This way device drivers could just skip over it like a VLAN and just use network and transport header values like they would otherwise. - Alex
Re: [PATCH net-next] dsa: mv88e6xxx: Timeout based on iterations
Hi Andrew, Andrew Lunnwrites: > The mv88e6xxx driver times out operations on the switch based on > looping until an elapsed wall clock time is reached. However, if > usleep_range() sleeps much longer than expected, it could timeout with > an error without actually checking to see if the devices has completed > the operation. So replace the elapsed time with a fixed upper bound on > the number of loops. > > Testing on various switches has shown that switches takes either 0 or > 1 iteration, so a maximum of 16 iterations is a safe limit. > > Signed-off-by: Andrew Lunn > --- > drivers/net/dsa/mv88e6xxx/chip.c | 15 ++- > 1 file changed, 6 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/dsa/mv88e6xxx/chip.c > b/drivers/net/dsa/mv88e6xxx/chip.c > index a230fcba5b64..ac8e9af4879f 100644 > --- a/drivers/net/dsa/mv88e6xxx/chip.c > +++ b/drivers/net/dsa/mv88e6xxx/chip.c > @@ -309,9 +309,9 @@ static int mv88e6xxx_serdes_write(struct mv88e6xxx_chip > *chip, int reg, u16 val) > static int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg, > u16 mask) > { > - unsigned long timeout = jiffies + HZ / 10; > + int i; > > - while (time_before(jiffies, timeout)) { > + for (i = 0; i < 16; i++) { > u16 val; > int err; > Since we remove the elapsed time here, can we use mv88e6xxx_wait in mv88e6xxx_update? It'd be good to have a consistent wait routine everywhere. Thanks, Vivien
Re: [RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc
On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote: > The qdisc_reset operation depends on the qdisc lock at the moment > to halt any additions to gso_skb and statistics while the list is > free'd and the stats zeroed. ... > Signed-off-by: John Fastabend> --- > net/sched/sch_generic.c | 45 +++-- > 1 file changed, 35 insertions(+), 10 deletions(-) > > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index 3b9a21f..29238c4 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc) > kfree_skb(qdisc->skb_bad_txq); > qdisc->skb_bad_txq = NULL; > > + if (qdisc->gso_cpu_skb) { > + int i; > + > + for_each_possible_cpu(i) { > + struct gso_cell *cell; > + > + cell = per_cpu_ptr(qdisc->gso_cpu_skb, i); > + if (cell) { > + kfree_skb_list(cell->skb); > + cell = NULL; You probably wanted : cell->skb = NULL; > + } > + } > + } > + > if (qdisc->gso_skb) { > kfree_skb_list(qdisc->gso_skb); > qdisc->gso_skb = NULL; > @@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue > *dev_queue, > root_lock = qdisc_lock(oqdisc); > spin_lock_bh(root_lock); > > - /* Prune old scheduler */ > - if (oqdisc && atomic_read(>refcnt) <= 1) > - qdisc_reset(oqdisc); > - > This probably belongs to a separate patch, before any per cpu / lockless qdisc changes ?
Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
On Wed, 2016-08-17 at 12:38 -0700, John Fastabend wrote: > The sch_mq qdisc creates a sub-qdisc per tx queue which are then > called independently for enqueue and dequeue operations. However > statistics are aggregated and pushed up to the "master" qdisc. > > This patch adds support for any of the sub-qdiscs to be per cpu > statistic qdiscs. To handle this case add a check when calculating > stats and aggregate the per cpu stats if needed. > > Also exports __gnet_stats_copy_queue() to use as a helper function. Looks like this patch should be happening earlier in the series ?
[PATCH net-next 3/4] bpf: enable event output helper also for xdp types
Follow-up to 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output") for also adding the event output helper for XDP typed programs. The event output helper has been very useful in particular for debugging or event notification purposes, since it's much faster and flexible than regular trace printk due to programmatically being able to attach meta data. Same flags structure applies as with tc BPF programs. Signed-off-by: Daniel BorkmannAcked-by: Alexei Starovoitov --- net/core/filter.c | 42 +- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index abf546d..3b60dfd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2408,6 +2408,41 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #endif +static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, + unsigned long off, unsigned long len) +{ + memcpy(dst_buff, src_buff + off, len); + return 0; +} + +static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, + bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2492,7 +2527,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static 
const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id) { - return sk_filter_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_perf_event_output: + return _xdp_event_output_proto; + default: + return sk_filter_func_proto(func_id); + } } static bool __is_valid_access(int off, int size, enum bpf_access_type type) -- 1.9.3
[PATCH net-next 0/4] BPF helper improvements and cleanups
This set adds various improvements to BPF helpers, a cleanup to use skb_pkt_type_ok() helper, addition of bpf_skb_change_tail(), a follow up for event output helper and removing ifdefs around the cgroupv2 helper bits. For details please see individual patches. The set is based against net-next tree, but requires a merge of net into net-next first. Thanks a lot! Daniel Borkmann (4): bpf: use skb_pkt_type_ok helper in bpf_skb_change_type bpf: add bpf_skb_change_tail helper bpf: enable event output helper also for xdp types bpf: get rid of cgroup helper related ifdefs include/linux/skbuff.h | 43 +- include/net/sock.h | 10 include/uapi/linux/bpf.h | 11 net/core/filter.c| 152 +++ 4 files changed, 204 insertions(+), 12 deletions(-) -- 1.9.3
[PATCH net-next 4/4] bpf: get rid of cgroup helper related ifdefs
As recently discussed during the task_under_cgroup_hierarchy() addition, we should get rid of the ifdefs surrounding the bpf_skb_under_cgroup() helper. If related functionality is not built-in, the helper cannot be used anyway, which is also in line with what we do for all other helpers. Signed-off-by: Daniel BorkmannAcked-by: Alexei Starovoitov --- include/net/sock.h | 10 ++ net/core/filter.c | 6 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ff5be7e..2aab9b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1114,6 +1114,16 @@ static inline bool sk_stream_is_writeable(const struct sock *sk) sk_stream_memory_free(sk); } +static inline int sk_under_cgroup_hierarchy(struct sock *sk, + struct cgroup *ancestor) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), + ancestor); +#else + return -ENOTSUPP; +#endif +} static inline bool sk_has_memory_pressure(const struct sock *sk) { diff --git a/net/core/filter.c b/net/core/filter.c index 3b60dfd..a83766b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2374,7 +2374,6 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -#ifdef CONFIG_SOCK_CGROUP_DATA static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long)r1; @@ -2395,7 +2394,7 @@ static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) if (unlikely(!cgrp)) return -EAGAIN; - return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), cgrp); + return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { @@ -2406,7 +2405,6 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; -#endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) @@ -2515,10 +2513,8 @@ 
tc_cls_act_func_proto(enum bpf_func_id func_id) return _skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return _get_smp_processor_id_proto; -#ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_under_cgroup: return _skb_under_cgroup_proto; -#endif default: return sk_filter_func_proto(func_id); } -- 1.9.3
Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule
On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote: > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index d10b762..f5b7254 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, > if (qdisc_is_percpu_stats(q)) { > qdisc_qstats_cpu_backlog_inc(q, nskb); > qdisc_qstats_cpu_qlen_inc(q); > + set_thread_flag(TIF_NEED_RESCHED); > } else { > qdisc_qstats_backlog_inc(q, nskb); > q->q.qlen++; Hmm... care to elaborate this bit ?
[PATCH net-next 1/4] bpf: use skb_pkt_type_ok helper in bpf_skb_change_type
Since we have a skb_pkt_type_ok() helper for checking the type before mangling, make use of it instead of open coding. Follow-up to commit 8b10cab64c13 ("net: simplify and make pkt_type_ok() available for other users") that came in after d2485c4242a8 ("bpf: add bpf_skb_change_type helper"). Signed-off-by: Daniel BorkmannAcked-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index cb06ace..58b5e6d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1976,8 +1976,8 @@ static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) u32 pkt_type = r2; /* We only allow a restricted subset to be changed for now. */ - if (unlikely(skb->pkt_type > PACKET_OTHERHOST || -pkt_type > PACKET_OTHERHOST)) + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || +!skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; -- 1.9.3
[PATCH net-next 2/4] bpf: add bpf_skb_change_tail helper
This work adds a bpf_skb_change_tail() helper for tc BPF programs. The basic idea is to expand or shrink the skb in a controlled manner. The eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(), bpf_lX_csum_replace() and others rather than passing a raw buffer for writing here. bpf_skb_change_tail() is really a slow path helper and intended for replies with f.e. ICMP control messages. Concept is similar to other helpers like bpf_skb_change_proto() helper to keep the helper without protocol specifics and let the BPF program mangle the remaining parts. A flags field has been added and is reserved for now should we extend the helper in future. Signed-off-by: Daniel BorkmannAcked-by: Alexei Starovoitov --- include/linux/skbuff.h | 43 +++- include/uapi/linux/bpf.h | 11 ++ net/core/filter.c| 100 +-- 3 files changed, 150 insertions(+), 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f665cb..7047448 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) int ___pskb_trim(struct sk_buff *skb, unsigned int len); -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (unlikely(skb_is_nonlinear(skb))) { WARN_ON(1); @@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) skb_set_tail_pointer(skb, len); } +static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + __skb_set_length(skb, len); +} + void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) @@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) BUG_ON(err); } +static inline int __skb_grow(struct sk_buff *skb, unsigned int len) +{ + unsigned int diff = len - skb->len; + + if (skb_tailroom(skb) < 
diff) { + int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), + GFP_ATOMIC); + if (ret) + return ret; + } + __skb_set_length(skb, len); + return 0; +} + /** * skb_orphan - orphan a buffer * @skb: buffer to orphan @@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return __pskb_trim(skb, len); } +static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + __skb_trim(skb, len); + return 0; +} + +static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + return __skb_grow(skb, len); +} + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +static inline void skb_gso_reset(struct sk_buff *skb) +{ + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 866d53c..e4c5a1b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -386,6 +386,17 @@ enum bpf_func_id { */ BPF_FUNC_current_task_under_cgroup, + /** +* bpf_skb_change_tail(skb, len, flags) +* The helper will resize the skb to the given new size, +* to be used f.e. with control messages. 
+* @skb: pointer to skb +* @len: new skb length +* @flags: reserved +* Return: 0 on success or negative error +*/ + BPF_FUNC_skb_change_tail, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 58b5e6d..abf546d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1350,14 +1350,18 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { -
Re: [RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc
On 16-08-17 03:58 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote: >> Similar to how gso is handled skb_bad_tx needs to be per cpu to handle >> lockless qdisc with multiple writer/producers. > \ >> @@ -1021,6 +1026,7 @@ err_out4: >> free_percpu(sch->cpu_bstats); >> free_percpu(sch->cpu_qstats); >> free_percpu(sch->gso_cpu_skb); >> +free_percpu(sch->skb_bad_txq_cpu); > > > This might be the time to group all these per cpu allocations to a > single one, to help data locality and decrease overhead of having XX > pointers. > > > Sounds like a good idea to me. I'll go ahead and add a patch to the front to consolidate the stats and then add these there.
Re: [RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc
On 16-08-17 03:53 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote: >> The qdisc_reset operation depends on the qdisc lock at the moment >> to halt any additions to gso_skb and statistics while the list is >> free'd and the stats zeroed. > > ... > >> Signed-off-by: John Fastabend>> --- >> net/sched/sch_generic.c | 45 +++-- >> 1 file changed, 35 insertions(+), 10 deletions(-) >> >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c >> index 3b9a21f..29238c4 100644 >> --- a/net/sched/sch_generic.c >> +++ b/net/sched/sch_generic.c >> @@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc) >> kfree_skb(qdisc->skb_bad_txq); >> qdisc->skb_bad_txq = NULL; >> >> +if (qdisc->gso_cpu_skb) { >> +int i; >> + >> +for_each_possible_cpu(i) { >> +struct gso_cell *cell; >> + >> +cell = per_cpu_ptr(qdisc->gso_cpu_skb, i); >> +if (cell) { >> +kfree_skb_list(cell->skb); >> +cell = NULL; > > You probably wanted : > cell->skb = NULL; > Yep thanks! > >> +} >> +} >> +} >> + >> if (qdisc->gso_skb) { >> kfree_skb_list(qdisc->gso_skb); >> qdisc->gso_skb = NULL; >> @@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue >> *dev_queue, >> root_lock = qdisc_lock(oqdisc); >> spin_lock_bh(root_lock); >> >> -/* Prune old scheduler */ >> -if (oqdisc && atomic_read(>refcnt) <= 1) >> -qdisc_reset(oqdisc); >> - >> > > This probably belongs to a separate patch, before any per cpu / lockless > qdisc changes ? > > > Agreed will do for next rev thanks for reviewing.
Re: [RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc
On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote: > Similar to how gso is handled skb_bad_tx needs to be per cpu to handle > lockless qdisc with multiple writer/producers. \ > @@ -1021,6 +1026,7 @@ err_out4: > free_percpu(sch->cpu_bstats); > free_percpu(sch->cpu_qstats); > free_percpu(sch->gso_cpu_skb); > + free_percpu(sch->skb_bad_txq_cpu); This might be the time to group all these per cpu allocations to a single one, to help data locality and decrease overhead of having XX pointers.
Re: [PATCH net-next 3/3] net: veth: Set features for MPLS
On 8/17/16 4:41 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 14:49 -0700, David Ahern wrote: >> veth does not really transmit packets only moves the skb from one >> netdev to another so gso and checksum is not really needed. Add >> the features to mpls_features to get the same benefit and performance >> with MPLS as without it. > > It seems mpls_features should also be managed by bonding and team > drivers ... > We'll get there. veth is used in Lennert's example so adding an update for it now. Can add other devices in time.
Re: [PATCH net] netfilter: tproxy: properly refcount tcp listeners
On Wed, Aug 17, 2016 at 09:56:46AM -0700, Eric Dumazet wrote: > From: Eric Dumazet> > inet_lookup_listener() and inet6_lookup_listener() no longer > take a reference on the found listener. > > This minimal patch adds back the refcounting, but we might do > this differently in net-next later. Applied, thanks Eric!
Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking
On 16-08-17 03:33 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote: >> This patch adds a flag for queueing disciplines to indicate the stack >> does not need to use the qdisc lock to protect operations. This can >> be used to build lockless scheduling algorithms and improving >> performance. >> >> The flag is checked in the tx path and the qdisc lock is only taken >> if it is not set. For now use a conditional if statement. Later we >> could be more aggressive if it proves worthwhile and use a static key >> or wrap this in a likely(). >> >> Signed-off-by: John Fastabend>> --- >> include/net/pkt_sched.h |4 +++- >> include/net/sch_generic.h |1 + >> net/core/dev.c| 32 >> net/sched/sch_generic.c | 26 -- >> 4 files changed, 48 insertions(+), 15 deletions(-) >> >> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h >> index 7caa99b..69540c6 100644 >> --- a/include/net/pkt_sched.h >> +++ b/include/net/pkt_sched.h >> @@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q); >> >> static inline void qdisc_run(struct Qdisc *q) >> { >> -if (qdisc_run_begin(q)) >> +if (qdisc_run_begin(q)) { >> __qdisc_run(q); >> +qdisc_run_end(q); >> +} >> } > > > Looks like you could have a separate patch, removing qdisc_run_end() > call done in __qdisc_run(q) ? > > Then the 'allow qdiscs to handle locking' > > Agreed that would clean this up a bit. Will do for next rev.
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
On 08/17/2016 03:32 PM, Timur Tabi wrote: > Timur Tabi wrote: >>> >>> Nothing prevents you from detailing in ACPI or DT sub-components of a >>> larger HW block, if there is such a desire, but just make it in a way >>> that it looks like what would be done for e.g: separate discrete parts, >>> only the parenting of nodes would change. >> >> So instead of just having a single property, instead create a child node >> just for the SGMII phy, with its own compatible string and various other >> properties (like interrupt and base register)? That's not a bad idea. > > I need help getting this to work. > > emac0: ethernet@3880 { > compatible = "qcom,qdf2432-emac"; > reg-names = "base", "csr"; > reg = <0x0 0x3880 0x0 0x1 >, > <0x0 0x38816000 0x0 0x1000>, > <0x0 0x3881C000 0x0 0x4000>; > interrupts = <0 0x100 0 0 0x104 0>; > interrupt-names = "core0", "sgmii"; > > sgmii-handle = <_sgmii>; > phy-handle = <>; > > #address-cells = <1>; > #size-cells = <0>; > phy0: ethernet-phy@4 { > reg = <4>; > }; > }; > > emac_sgmii: ethernet-phy@410400 { > compatible = "qcom,qdf2432-emac-phy"; > reg = <0x0 0x00410400 0x0 0x100>; > interrupts = <0 0x104 0>; > }; > Is this register range relative to the emac0 node here, or is this really a separate node, within the same adress space as your emac0 node? > When my driver probes, the platform_device object points to the emac0 > node, as always. > > How do I parse the emac_sgmii node? How do I get functions like > platform_get_resource() to work? How do I create a new platform_device > object that points to the emac_sgmii node? Answer largely depends on whether your device is really located outside of the emac, if it located outside, then a platform device matching the compatible string would get you what you want. 
If the emac_sgmii block is a sub-block within the EMAC, then a few things need fixing: - your emac_sgmii node should be a sub-node of the emac node, not a sibling - the emac0 node should have a "ranges" property that indicates how to translate the sub-nodes' "reg" property based on the base register address of the emac0 block - you would have to call of_platform_populate from the EMAC driver to ensure that the emac_sgmii child node and therefore platform device gets created -- Florian
Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking
On 16-08-17 03:34 PM, Eric Dumazet wrote: > On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote: > > >> diff --git a/net/core/dev.c b/net/core/dev.c >> index 4ce07dc..5db395d 100644 >> --- a/net/core/dev.c >> +++ b/net/core/dev.c >> @@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, >> struct Qdisc *q, >> int rc; >> >> qdisc_calculate_pkt_len(skb, q); >> + >> +if (q->flags & TCQ_F_NOLOCK) { >> +if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, >state))) { >> +__qdisc_drop(skb, _free); >> +rc = NET_XMIT_DROP; >> +} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) { > > For a lockless qdisc, do you believe TCQ_F_CAN_BYPASS is still a gain ? > For the benchmarks from pktgen it appears to be a win or mute to just drop the TCQ_F_CAN_BYPASS (just taking a look at one sample below) nolock & nobypass locked (current master) -- 1: 1435796 1471479 2: 1880642 1746231 4: 1922935 1119626 8: 1585055 1001471 12: 1479273 989269 The only thing would be to test a bunch of netperf RR sessions to be sure. > Also !qdisc_qlen(q) looks racy anyway ? Yep its racy unless you make it an atomic but this hurts performance metrics. There is a patch further in the stack here that adds the atomic variants but I tend to think we can just drop the bypass logic in the lockless case assuming the netperf tests look good. > >> +qdisc_bstats_cpu_update(q, skb); >> +if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) >> +__qdisc_run(q); >> +rc = NET_XMIT_SUCCESS; >> +} else { >> +rc = q->enqueue(skb, q, _free) & NET_XMIT_MASK; >> +__qdisc_run(q); >> +} >> + >> +if (unlikely(to_free)) >> +kfree_skb_list(to_free); >> +return rc; >> +} >> + > >
Re: [PATCH net-next 3/3] net: veth: Set features for MPLS
On Wed, 2016-08-17 at 14:49 -0700, David Ahern wrote: > veth does not really transmit packets only moves the skb from one > netdev to another so gso and checksum is not really needed. Add > the features to mpls_features to get the same benefit and performance > with MPLS as without it. It seems mpls_features should also be managed by bonding and team drivers ...
Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking
On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote: > diff --git a/net/core/dev.c b/net/core/dev.c > index 4ce07dc..5db395d 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, > struct Qdisc *q, > int rc; > > qdisc_calculate_pkt_len(skb, q); > + > + if (q->flags & TCQ_F_NOLOCK) { > + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, >state))) { > + __qdisc_drop(skb, _free); > + rc = NET_XMIT_DROP; > + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) { For a lockless qdisc, do you believe TCQ_F_CAN_BYPASS is still a gain ? Also !qdisc_qlen(q) looks racy anyway ? > + qdisc_bstats_cpu_update(q, skb); > + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) > + __qdisc_run(q); > + rc = NET_XMIT_SUCCESS; > + } else { > + rc = q->enqueue(skb, q, _free) & NET_XMIT_MASK; > + __qdisc_run(q); > + } > + > + if (unlikely(to_free)) > + kfree_skb_list(to_free); > + return rc; > + } > +
Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking
On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote: > This patch adds a flag for queueing disciplines to indicate the stack > does not need to use the qdisc lock to protect operations. This can > be used to build lockless scheduling algorithms and improving > performance. > > The flag is checked in the tx path and the qdisc lock is only taken > if it is not set. For now use a conditional if statement. Later we > could be more aggressive if it proves worthwhile and use a static key > or wrap this in a likely(). > > Signed-off-by: John Fastabend> --- > include/net/pkt_sched.h |4 +++- > include/net/sch_generic.h |1 + > net/core/dev.c| 32 > net/sched/sch_generic.c | 26 -- > 4 files changed, 48 insertions(+), 15 deletions(-) > > diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h > index 7caa99b..69540c6 100644 > --- a/include/net/pkt_sched.h > +++ b/include/net/pkt_sched.h > @@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q); > > static inline void qdisc_run(struct Qdisc *q) > { > - if (qdisc_run_begin(q)) > + if (qdisc_run_begin(q)) { > __qdisc_run(q); > + qdisc_run_end(q); > + } > } Looks like you could have a separate patch, removing qdisc_run_end() call done in __qdisc_run(q) ? Then the 'allow qdiscs to handle locking'
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
Timur Tabi wrote: Nothing prevents you from detailing in ACPI or DT sub-components of a larger HW block, if there is such a desire, but just make it in a way that it looks like what would be done for e.g: separate discrete parts, only the parenting of nodes would change. So instead of just having a single property, instead create a child node just for the SGMII phy, with its own compatible string and various other properties (like interrupt and base register)? That's not a bad idea. I need help getting this to work. emac0: ethernet@3880 { compatible = "qcom,qdf2432-emac"; reg-names = "base", "csr"; reg = <0x0 0x3880 0x0 0x1 >, <0x0 0x38816000 0x0 0x1000>, <0x0 0x3881C000 0x0 0x4000>; interrupts = <0 0x100 0 0 0x104 0>; interrupt-names = "core0", "sgmii"; sgmii-handle = <_sgmii>; phy-handle = <>; #address-cells = <1>; #size-cells = <0>; phy0: ethernet-phy@4 { reg = <4>; }; }; emac_sgmii: ethernet-phy@410400 { compatible = "qcom,qdf2432-emac-phy"; reg = <0x0 0x00410400 0x0 0x100>; interrupts = <0 0x104 0>; }; When my driver probes, the platform_device object points to the emac0 node, as always. How do I parse the emac_sgmii node? How do I get functions like platform_get_resource() to work? How do I create a new platform_device object that points to the emac_sgmii node? -- Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
[PATCH net-next 2/3] net: mpls: Fixups for GSO
As reported by Lennert the MPLS GSO code is failing to properly segment large packets. There are a couple of problems: 1. the inner protocol is not set so the gso segment functions for inner protocol layers are not getting run, and 2 MPLS labels for packets that use the "native" (non-OVS) MPLS code are not properly accounted for in mpls_gso_segment. The MPLS GSO code was added for OVS. It is re-using skb_mac_gso_segment to call the gso segment functions for the higher layer protocols. That means skb_mac_gso_segment is called twice -- once with the network protocol set to MPLS and again with the network protocol set to the inner protocol. This patch sets the inner skb protocol addressing item 1 above and sets the network_header and inner_network_header to mark where the MPLS labels start and end. The MPLS code in OVS is also updated to set the two network markers. >From there the MPLS GSO code uses the difference between the network header and the inner network header to know the size of the MPLS header that was pushed. It then pulls the MPLS header, resets the mac_len and protocol for the inner protocol and then calls skb_mac_gso_segment to segment the skb. Afterwards the skb protocol is set to mpls for each segment as suggested by Simon. Reported-by: Lennert BuytenhekSigned-off-by: David Ahern --- net/mpls/mpls_gso.c | 24 +--- net/mpls/mpls_iptunnel.c | 5 + net/openvswitch/actions.c | 6 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 2055e57ed1c3..fa6899f02cc8 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -22,33 +22,35 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) { + int mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; netdev_features_t mpls_features; __be16 mpls_protocol; + u16 mac_len = skb->mac_len; /* Setup inner SKB. 
*/ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; - /* Push back the mac header that skb_mac_gso_segment() has pulled. -* It will be re-pulled by the call to skb_mac_gso_segment() below -*/ - __skb_push(skb, skb->mac_len); + __skb_pull(skb, mpls_hlen); + skb->mac_len = skb_inner_network_offset(skb); /* Segment inner packet. */ mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); - + if (IS_ERR_OR_NULL(segs)) { + skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset, +mac_len); + goto out; + } /* Restore outer protocol. */ skb->protocol = mpls_protocol; + for (skb = segs; skb; skb = skb->next) + skb->protocol = mpls_protocol; - /* Re-pull the mac header that the call to skb_mac_gso_segment() -* above pulled. It will be re-pushed after returning -* skb_mac_gso_segment(), an indirect caller of this function. -*/ - __skb_pull(skb, skb->data - skb_mac_header(skb)); - +out: return segs; } diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index aed872cc05a6..55c5ab907563 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -90,7 +90,12 @@ static int mpls_xmit(struct sk_buff *skb) if (skb_cow(skb, hh_len + new_header_size)) goto drop; + skb_set_inner_protocol(skb, skb->protocol); + skb_reset_inner_network_header(skb); + skb->encapsulation = 1; + skb_push(skb, new_header_size); + skb_reset_network_header(skb); skb->dev = out_dev; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1ecbd7715f6d..6d78f162a88b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb->mac_len); skb_reset_mac_header(skb); + /* for GSO: set MPLS as network header and encapsulated protocol +* header as inner network header +*/ + skb_set_network_header(skb, skb->mac_len); + skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN); + new_mpls_lse = (__be32 
*)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; -- 2.1.4
[PATCH net-next 3/3] net: veth: Set features for MPLS
veth does not really transmit packets only moves the skb from one netdev to another so gso and checksum is not really needed. Add the features to mpls_features to get the same benefit and performance with MPLS as without it. Reported-by: Lennert BuytenhekSigned-off-by: David Ahern --- drivers/net/veth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..5db320a4d5cf 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -340,6 +340,7 @@ static void veth_setup(struct net_device *dev) dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; + dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; } /* -- 2.1.4
[PATCH net-next 1/3] net: lwtunnel: Handle fragmentation
From: Roopa PrabhuToday mpls iptunnel lwtunnel_output redirect expects the tunnel output function to handle fragmentation. This is ok but can be avoided if we did not do the mpls output redirect too early. ie we could wait until ip fragmentation is done and then call mpls output for each ip fragment. To make this work we will need, 1) the lwtunnel state to carry encap headroom 2) and do the redirect to the encap output handler on the ip fragment (essentially do the output redirect after fragmentation) This patch adds tunnel headroom in lwtstate to make sure we account for tunnel data in mtu calculations during fragmentation and adds new xmit redirect handler to redirect to lwtunnel xmit func after ip fragmentation. This includes IPV6 and some mtu fixes and testing from David Ahern. Signed-off-by: Roopa Prabhu Signed-off-by: David Ahern --- include/net/lwtunnel.h | 44 net/core/lwtunnel.c | 35 +++ net/ipv4/ip_output.c | 8 net/ipv4/route.c | 4 +++- net/ipv6/ip6_output.c| 8 net/ipv6/route.c | 4 +++- net/mpls/mpls_iptunnel.c | 9 + 7 files changed, 106 insertions(+), 6 deletions(-) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index e9f116e29c22..ea3f80f58fd6 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -13,6 +13,13 @@ /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) +#define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) + +enum { + LWTUNNEL_XMIT_DONE, + LWTUNNEL_XMIT_CONTINUE, +}; + struct lwtunnel_state { __u16 type; @@ -21,6 +28,7 @@ struct lwtunnel_state { int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); int len; + __u16 headroom; __u8data[0]; }; @@ -34,6 +42,7 @@ struct lwtunnel_encap_ops { struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); + int (*xmit)(struct sk_buff *skb); }; #ifdef CONFIG_LWTUNNEL 
@@ -75,6 +84,24 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) return false; } + +static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) + return true; + + return false; +} + +static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, +unsigned int mtu) +{ + if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu) + return lwtstate->headroom; + + return 0; +} + int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, @@ -90,6 +117,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); +int lwtunnel_xmit(struct sk_buff *skb); #else @@ -117,6 +145,17 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, +unsigned int mtu) +{ + return 0; +} + static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { @@ -170,6 +209,11 @@ static inline int lwtunnel_input(struct sk_buff *skb) return -EOPNOTSUPP; } +static inline int lwtunnel_xmit(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 669ecc9f884e..e5f84c26ba1a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -251,6 +251,41 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(lwtunnel_output); +int 
lwtunnel_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + + lwtstate = dst->lwtstate; + + if
[PATCH v2 net-next 0/3] net: mpls: fragmentation and gso fixes for locally originated traffic
This series fixes mtu and fragmentation for tunnels using lwtunnel output redirect, and fixes GSO for MPLS for locally originated traffic reported by Lennert Buytenhek. A follow on series will address fragmentation and GSO for forwarded MPLS traffic. Hardware offload of GSO with MPLS also needs to be addressed. v2 - consistent use of network_header in skb to fix GSO for MPLS - update MPLS code in OVS to network_header and inner_network_header David Ahern (2): net: mpls: Fixups for GSO net: veth: Set features for MPLS Roopa Prabhu (1): net: lwtunnel: Handle fragmentation drivers/net/veth.c| 1 + include/net/lwtunnel.h| 44 net/core/lwtunnel.c | 35 +++ net/ipv4/ip_output.c | 8 net/ipv4/route.c | 4 +++- net/ipv6/ip6_output.c | 8 net/ipv6/route.c | 4 +++- net/mpls/mpls_gso.c | 24 +--- net/mpls/mpls_iptunnel.c | 14 ++ net/openvswitch/actions.c | 6 ++ 10 files changed, 131 insertions(+), 17 deletions(-) -- 2.1.4
[PATCH iproute2] ip: report IFLA_GSO_MAX_SIZE and IFLA_GSO_MAX_SEGS
From: Eric Dumazetkernel support for these attributes was added in linux-4.6 Signed-off-by: Eric Dumazet --- ip/ipaddress.c |8 1 file changed, 8 insertions(+) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index ab4b1b1..76bd7b3 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -901,6 +901,14 @@ int print_linkinfo(const struct sockaddr_nl *who, fprintf(fp, "numrxqueues %u ", rta_getattr_u32(tb[IFLA_NUM_RX_QUEUES])); + if (tb[IFLA_GSO_MAX_SIZE]) + fprintf(fp, "gso_max_size %u ", + rta_getattr_u32(tb[IFLA_GSO_MAX_SIZE])); + + if (tb[IFLA_GSO_MAX_SEGS]) + fprintf(fp, "gso_max_segs %u ", + rta_getattr_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_PHYS_PORT_NAME]) fprintf(fp, "portname %s ", rta_getattr_str(tb[IFLA_PHYS_PORT_NAME]));
Re: [iproute PATCH 1/2] ipaddress: Simplify vf_info parsing
On Wed, Jun 01, 2016 at 10:03:49PM +0200, Phil Sutter wrote: > Not sure whether I misinterpret commit 7b8179c780a1a, but it looks > overly complicated. Instead rely upon parse_rtattr_nested() to assign > the relevant pointer if requested rtattr fields are present. In order to validate correctness of this patch, I compiled a kernel which does not export IFLA_VF_SPOOFCHK and verified correct functionality. Is there anything else I could do in order to convince you to accept these patches? Thanks, Phil
[PATCH net-next] dsa: mv88e6xxx: Timeout based on iterations
The mv88e6xxx driver times out operations on the switch based on looping until an elapsed wall clock time is reached. However, if usleep_range() sleeps much longer than expected, it could timeout with an error without actually checking to see if the devices has completed the operation. So replace the elapsed time with a fixed upper bound on the number of loops. Testing on various switches has shown that switches takes either 0 or 1 iteration, so a maximum of 16 iterations is a safe limit. Signed-off-by: Andrew Lunn--- drivers/net/dsa/mv88e6xxx/chip.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index a230fcba5b64..ac8e9af4879f 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -309,9 +309,9 @@ static int mv88e6xxx_serdes_write(struct mv88e6xxx_chip *chip, int reg, u16 val) static int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg, u16 mask) { - unsigned long timeout = jiffies + HZ / 10; + int i; - while (time_before(jiffies, timeout)) { + for (i = 0; i < 16; i++) { u16 val; int err; @@ -375,7 +375,7 @@ static int _mv88e6xxx_reg_write(struct mv88e6xxx_chip *chip, int addr, static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip) { int ret; - unsigned long timeout; + int i; ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_CONTROL); if (ret < 0) @@ -386,8 +386,7 @@ static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip) if (ret) return ret; - timeout = jiffies + 1 * HZ; - while (time_before(jiffies, timeout)) { + for (i = 0; i < 16; i++) { ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_STATUS); if (ret < 0) return ret; @@ -403,8 +402,7 @@ static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip) static int mv88e6xxx_ppu_enable(struct mv88e6xxx_chip *chip) { - int ret, err; - unsigned long timeout; + int ret, err, i; ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_CONTROL); if (ret < 0) @@ -415,8 +413,7 
@@ static int mv88e6xxx_ppu_enable(struct mv88e6xxx_chip *chip) if (err) return err; - timeout = jiffies + 1 * HZ; - while (time_before(jiffies, timeout)) { + for (i = 0; i < 16; i++) { ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_STATUS); if (ret < 0) return ret; -- 2.8.1
[PATCH net-next] tcp: refine tcp_prune_ofo_queue() to not drop all packets
From: Eric DumazetOver the years, TCP BDP has increased a lot, and is typically in the order of ~10 Mbytes with help of clever Congestion Control modules. In presence of packet losses, TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can match the BDP (~10 Mbytes) In some cases, TCP needs to make room for incoming skbs, and current strategy can simply remove all skbs in the out of order queue as a last resort, incurring a huge penalty, both for receiver and sender. Unfortunately these 'last resort events' are quite frequent, forcing sender to send all packets again, stalling the flow and wasting a lot of resources. This patch cleans only a part of the out of order queue in order to meet the memory constraints. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Cc: C. Stephen Gun Cc: Van Jacobson --- net/ipv4/tcp_input.c | 47 - 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, skb, size)) { + while (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - - if (!sk_rmem_schedule(sk, skb, size)) - return -1; } } return 0; @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk) } /* - * Purge the out-of-order queue. - * Return true if queue was pruned. + * Clean the out-of-order queue to make room. + * We drop high sequences packets to : + * 1) Let a chance for holes to be filled. + * 2) not add too big latencies if thousands of packets sit there. 
+ *(But if application shrinks SO_RCVBUF, we could still end up + * freeing whole queue here) + * + * Return true if queue has shrunk. */ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - bool res = false; + struct sk_buff *skb; - if (!skb_queue_empty(>out_of_order_queue)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - __skb_queue_purge(>out_of_order_queue); + if (skb_queue_empty(>out_of_order_queue)) + return false; - /* Reset SACK state. A conforming SACK implementation will -* do the same at a timeout based retransmit. When a connection -* is in a sad state like this, we care only about integrity -* of the connection not performance. -*/ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(>rx_opt); + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + + while ((skb = __skb_dequeue_tail(>out_of_order_queue)) != NULL) { + tcp_drop(sk, skb); sk_mem_reclaim(sk); - res = true; + if (atomic_read(>sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; } - return res; + + /* Reset SACK state. A conforming SACK implementation will +* do the same at a timeout based retransmit. When a connection +* is in a sad state like this, we care only about integrity +* of the connection not performance. +*/ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(>rx_opt); + return true; } /* Reduce allocated memory if we can, trying to get
[PATCH net-next] net: bgmac: make it clear when setting interface type to RMII
From: Rafał MiłeckiIt doesn't really change anything as BGMAC_CHIPCTL_1_IF_TYPE_RMII is equal to 0. It make code a bit clener, so far when reading it one could think we forgot to set a proper mode. It also keeps this mode code in sync with other ones. Signed-off-by: Rafał Miłecki --- drivers/net/ethernet/broadcom/bgmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 63ef7235..6ea0e5f 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -932,7 +932,8 @@ static void bgmac_chip_reset(struct bgmac *bgmac) et_swtype <<= 4; sw_type = et_swtype; } else if (bgmac->feature_flags & BGMAC_FEAT_SW_TYPE_EPHYRMII) { - sw_type = BGMAC_CHIPCTL_1_SW_TYPE_EPHYRMII; + sw_type = BGMAC_CHIPCTL_1_IF_TYPE_RMII | + BGMAC_CHIPCTL_1_SW_TYPE_EPHYRMII; } else if (bgmac->feature_flags & BGMAC_FEAT_SW_TYPE_RGMII) { sw_type = BGMAC_CHIPCTL_1_IF_TYPE_RGMII | BGMAC_CHIPCTL_1_SW_TYPE_RGMII; -- 1.8.4.5
[PATCH net-next] net: bgmac: support Ethernet core on BCM53573 SoCs
From: Rafał MiłeckiBCM53573 is a new series of Broadcom's SoCs. It's based on ARM and can be found in two packages (versions): BCM53573 and BCM47189. It shares some code with the Northstar family, but also requires some new quirks. First of all there can be up to 2 Ethernet cores on this SoC. If that is the case, they are connected to two different switch ports allowing some more complex/optimized setups. It seems the second unit doesn't come fully configured and requires some IRQ quirk. Other than that only the first core is connected to the PHY. For the second one we have to register fixed PHY (similarly to the Northstar), otherwise generic PHY driver would get some invalid info. This has been successfully tested on Tenda AC9 (BCM47189B0). Signed-off-by: Rafał Miłecki --- drivers/net/ethernet/broadcom/bgmac-bcma.c | 19 ++- drivers/net/ethernet/broadcom/bgmac.c | 25 + drivers/net/ethernet/broadcom/bgmac.h | 19 +++ include/linux/bcma/bcma.h | 3 +++ include/linux/bcma/bcma_regs.h | 1 + 5 files changed, 66 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 9a9745c4..3bc0a04 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -92,6 +92,7 @@ MODULE_DEVICE_TABLE(bcma, bgmac_bcma_tbl); /* http://bcm-v4.sipsolutions.net/mac-gbit/gmac/chipattach */ static int bgmac_probe(struct bcma_device *core) { + struct bcma_chipinfo *ci = >bus->chipinfo; struct ssb_sprom *sprom = >bus->sprom; struct mii_bus *mii_bus; struct bgmac *bgmac; @@ -157,7 +158,8 @@ static int bgmac_probe(struct bcma_device *core) dev_info(bgmac->dev, "Found PHY addr: %d%s\n", bgmac->phyaddr, bgmac->phyaddr == BGMAC_PHY_NOREGS ? 
" (NOREGS)" : ""); - if (!bgmac_is_bcm4707_family(core)) { + if (!bgmac_is_bcm4707_family(core) && + !(ci->id == BCMA_CHIP_ID_BCM53573 && core->core_unit == 1)) { mii_bus = bcma_mdio_mii_register(core, bgmac->phyaddr); if (!IS_ERR(mii_bus)) { err = PTR_ERR(mii_bus); @@ -230,6 +232,21 @@ static int bgmac_probe(struct bcma_device *core) bgmac->feature_flags |= BGMAC_FEAT_NO_RESET; bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500; break; + case BCMA_CHIP_ID_BCM53573: + bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; + bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK; + if (ci->pkg == BCMA_PKG_ID_BCM47189) + bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED; + if (core->core_unit == 0) { + bgmac->feature_flags |= BGMAC_FEAT_CC4_IF_SW_TYPE; + if (ci->pkg == BCMA_PKG_ID_BCM47189) + bgmac->feature_flags |= + BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII; + } else if (core->core_unit == 1) { + bgmac->feature_flags |= BGMAC_FEAT_IRQ_ID_OOB_6; + bgmac->feature_flags |= BGMAC_FEAT_CC7_IF_TYPE_RGMII; + } + break; default: bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK; diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index c4751ec..63ef7235 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -940,6 +940,27 @@ static void bgmac_chip_reset(struct bgmac *bgmac) bgmac_cco_ctl_maskset(bgmac, 1, ~(BGMAC_CHIPCTL_1_IF_TYPE_MASK | BGMAC_CHIPCTL_1_SW_TYPE_MASK), sw_type); + } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE) { + u32 sw_type = BGMAC_CHIPCTL_4_IF_TYPE_MII | + BGMAC_CHIPCTL_4_SW_TYPE_EPHY; + u8 et_swtype = 0; + char buf[4]; + + if (bcm47xx_nvram_getenv("et_swtype", buf, sizeof(buf)) > 0) { + if (kstrtou8(buf, 0, _swtype)) + dev_err(bgmac->dev, "Failed to parse et_swtype (%s)\n", + buf); + sw_type = (et_swtype & 0x0f) << 12; + } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII) { + sw_type = BGMAC_CHIPCTL_4_IF_TYPE_RGMII | + 
BGMAC_CHIPCTL_4_SW_TYPE_RGMII; + } + bgmac_cco_ctl_maskset(bgmac, 4, ~(BGMAC_CHIPCTL_4_IF_TYPE_MASK | + BGMAC_CHIPCTL_4_SW_TYPE_MASK), +
Re: [PATCH iproute] ipila: Fixed unitialized variables
On Mon, 15 Aug 2016 16:30:22 -0700 Tom Herbert wrote: > Initialize locator and locator_match to zero and only do > addattr if they have been set. > > Signed-off-by: Tom Herbert Applied, thanks
Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver
> Something is odd about of_mdiobus_register(). > > This function scans child nodes to look for PHYs: > > /* Loop over the child nodes and register a phy_device for each phy */ > for_each_available_child_of_node(np, child) { > addr = of_mdio_parse_addr(>dev, child); > > And in my driver, this works. However, later when I try to call > of_phy_find_device(), it fails. That's because of_phy_find_device() > wants the np of the child node. But the only way to get that is > with a phy-phandle property which I need to manually parse. You do need the phy-handle, because you can have multiple PHYs, and ethernet switches on the MDIO bus. You need to indicate which of those PHYs is connected to this MAC. > So what's the point of having of_mdiobus_register() parse child > nodes, if you need a phy-phandle pointer anyway? So we know about the other phys and switches on the MDIO bus, can load the driver for them, etc. This is standard policy for a bus driver. When you instantiate the bus, you enumerate it to see what is connected to it. You do that either with of_mdiobus_register() or mdiobus_register(). However, i do agree about having a helper do the phy-handle parsing would be nice, and it should also handle fixed phys as well. Andrew
Re: [PATCH iproute2] ip route: restore_handler should check tb[RTA_PREFSRC] for local networks
On Wed, 17 Aug 2016 11:32:53 +0800 Xin Long wrote: > Hi, Stephen > > any update on this ? Was waiting long enough to get a comment or two.
Re: [PATCH iproute2 v2 2/2] tipc: refactor bearer identification
On Mon, 15 Aug 2016 10:24:32 +0200 Richard Alpe wrote: > Introduce a generic function (nl_add_bearer_name()) that identifies a > bearer and adds it to an existing netlink message. This reduces code > complexity and makes the code a little bit easier to maintain. > > Signed-off-by: Richard Alpe Thanks for keeping TIPC support up to date. Applied.
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
Florian Fainelli wrote: If it has self identification, and the MAC is capable of knowing what kind of PHY it is internally bundled with, why does it matter to represent that specific piece of information in Device Tree or ACPI? That's the thing - the MAC is not capable of knowing. From a hardware perspective, the MAC and PHY are physically distinct IP blocks, and so the MAC hardware has no idea what it's really connected to. In theory, we don't even need an internal PHY, but if there is an internal PHY, we have to program it correctly. This is why we need a DT and ACPI property (or whatever) to tell the driver what kind of PHY it's attached to. The problem is that the internal PHY has mechanism for self-identification. There is no ACPI or device tree node for it. There is no register you can query to see if it's there or what version it is. The MAC part of the EMAC has no knowledge of the PHY part. The driver just has to know it's there. Nothing prevents you from detailing in ACPI or DT sub-components of a larger HW block, if there is such a desire, but just make it in a way that it looks like what would be done for e.g: separate discrete parts, only the parenting of nodes would change. So instead of just having a single property, instead create a child node just for the SGMII phy, with its own compatible string and various other properties (like interrupt and base register)? That's not a bad idea. -- Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
On 08/16/2016 02:37 PM, Timur Tabi wrote: > Al Stone wrote: >> Does the ACPI portion of the driver*have* to know about the PHY? In >> general, >> the ACPI assumption on ARM [**] is that those have all been set up >> before we >> get to the kernel. So, does it need to be visible to the ACPI part of >> the >> driver at all? > > Yes, the driver supports both "v1" and "v2" of the PHY, and it has code > like this: > > if (adpt->phy.version == 2) > emac_sgmii_init_v2(adpt); > else > emac_sgmii_init_v1(adpt); > > The question, how should adpt->phy.version be initialized on device tree > and ACPI platforms? > > The reason why this is confusing is because there are conflicting > perspectives of this "internal PHY". It both is and is not part of the > EMAC. It is a separate block in the chip, and can be replaced with > other SGMII blocks, and in the future there will be a "v3" and maybe a > "v4" or who knows what. In fact, maybe using version numbers is > inappropriate and we'll need to use vendor names or something. If it has self identification, and the MAC is capable of knowing what kind of PHY it is internally bundled with, why does it matter to represent that specific piece of information in Device Tree or ACPI? > > The problem is that the internal PHY has mechanism for > self-identification. There is no ACPI or device tree node for it. There > is no register you can query to see if it's there or what version it > is. The MAC part of the EMAC has no knowledge of the PHY part. The > driver just has to know it's there. Nothing prevents you from detailing in ACPI or DT sub-components of a larger HW block, if there is such a desire, but just make it in a way that it looks like what would be done for e.g: separate discrete parts, only the parenting of nodes would change. > > This is why I'm advocating a separate property (DT and ACPI) to identify > the internal PHY > > The truth is that I don't really care HOW it's done, and as long as we > come to a consensus soon. 
I want my driver to be merged into 4.9. > -- Florian
Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver
On 08/17/2016 12:38 PM, Timur Tabi wrote: >> If this is an external PHY, the expectation is that it will be hanging >> off a MDIO bus controller, either the MDIO bus internal to the MAC, or >> an external MDIO bus (separate register range). > > So for starters, I forgot to delete the 'compatible' property from this > document. Its presence breaks of_mdiobus_child_is_phy(). The MDIO devices and Ethernet PHY binding should certainly be improved, but the bulk of it is either you use ethernet-phy-ieee802.3-c22 or ethernet-phy-ieee802.3-c45, or nothing. Else, the MDIO bus probing code just assumes it can match a MDIO device (not specifically an Ethernet PHY) with its compatible string. > >> If such a PHY node is provided, the expectation is that your Ethernet >> MAC will have a phy-handle property and a phy-mode property to specify >> how to connect to this external PHY, so in your specific case, better >> remove the PHY from your example, or detail further how it should be >> done. > > Something is odd about of_mdiobus_register(). > > This function scans child nodes to look for PHYs: > > /* Loop over the child nodes and register a phy_device for each phy */ > for_each_available_child_of_node(np, child) { > addr = of_mdio_parse_addr(>dev, child); > > And in my driver, this works. However, later when I try to call > of_phy_find_device(), it fails. That's because of_phy_find_device() > wants the np of the child node. But the only way to get that is with a > phy-phandle property which I need to manually parse. of_get_phandle() should return the device_node pointer associated with the phy-handle property which you can later use and pass to of_phy_find_device(). There are plenty of examples in the tree and the logic is almost always the same. > > So what's the point of having of_mdiobus_register() parse child nodes, > if you need a phy-phandle pointer anyway? 
Not all MDIO devices are Ethernet PHYs, some of them are Ethernet switches, or any other PHY (USB, PCIE, SATA) just sitting on the same MDIO bus HW instance so you need to create device_node and struct device instances for these devices listed in the Device Tree since the bus is not self-discoverable (like I2C). Your Ethernet controller somehow needs to know which Ethernet PHY it is attached to since the location of the Ethernet PHY on the bus could change from design to design, let alone the actual topology (you could have a GPIO bit-banged MDIO bus or something else). >> Since you have a check on CONFIG_QCOM_EMAC in emac/Makefile, you could >> always recurse into that directory while building (use obj-y). > > Obviously, having "obj-$(CONFIG_QCOM_EMAC)" in both Makefiles is > redundant, but wouldn't it make more sense to change > "obj-$(CONFIG_QCOM_EMAC)" to "obj-y" in > drivers/net/ethernet/qualcomm/emac/Makefile, so that I only recurse if > necessary? Whichever makes the most sense, when there is a directory involved, my preference is to always recurse in that directory and selectively compile from there, since the Kconfig/Makefile options are all located within the same hierarchical level, just a preference. > >>> >+static void emac_adjust_link(struct net_device *netdev) >>> >+{ >>> >+struct emac_adapter *adpt = netdev_priv(netdev); >>> >+struct phy_device *phydev = netdev->phydev; >>> >+ >>> >+if (phydev->link) >>> >+emac_mac_start(adpt); >>> >+else >>> >+emac_mac_stop(adpt); > >> This is really excessive here, you typically don't need to completely >> stop your transmitter/receiver/DMA/what have you, just reprogram the MAC >> with the appropriate link speed/duplex/pause parameters. > > Yes, I realize that a lot of the reinit code in my driver is "heavy", > and that it could be optimized for specific situations. However, I'd > like to save that for another patch, because it would require me to > study the documentation and do extensive tests. 
> > I can't tell you today whether emac_mac_start() is overkill for the > specific link change. For all I know, the hardware does require the TX > to stop before changing the link speed. Remember, the EMAC does have a > weird double-phy (one internal, one external). Fair enough, not having to stop the TX from the SW perspective should be kind of basic requirement here, most drivers/HW don't seem to require that, because even if packets were inflight at the time the link went down, the MAC should sense the carrier loss and report that accordingly. Now there could be many different ways to implement and screw that up, so yes, future patch seems like a good approach. -- Florian
Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
On 16-08-17 12:38 PM, John Fastabend wrote: > The sch_mq qdisc creates a sub-qdisc per tx queue which are then > called independently for enqueue and dequeue operations. However > statistics are aggregated and pushed up to the "master" qdisc. > > This patch adds support for any of the sub-qdiscs to be per cpu > statistic qdiscs. To handle this case add a check when calculating > stats and aggregate the per cpu stats if needed. > > Also exports __gnet_stats_copy_queue() to use as a helper function. > > Signed-off-by: John Fastabend> --- [...] > + if (qdisc_is_percpu_stats(qdisc)) { > + cpu_bstats = qdisc->cpu_bstats; > + cpu_qstats = qdisc->cpu_qstats; > + } > + > + qlen = qdisc_qlen_sum(qdisc); > + > + __gnet_stats_copy_basic(NULL, >bstats, > + cpu_bstats, >bstats); > + __gnet_stats_copy_queue(>qstats, > + cpu_qstats, >qstats, qlen); And also I forgot to bump this onto the atomic qlen and it is still using the per cpu counters giving the incorrect qlen total.
Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule
On 16-08-17 12:37 PM, John Fastabend wrote: > netif_schedule uses a bit QDISC_STATE_SCHED to tell the qdisc layer > if a run of the qdisc has been scheduled. This is important when > tearing down qdisc instances. We can rcu_free an instance for example > if it's possible that we might have outstanding references to it. > > Perhaps more importantly in the per cpu lockless case we need to > schedule a run of the qdisc on all qdiscs that are enqueueing packets > and hitting the gso_skb requeue logic or else the skb may get stuck > on the gso_skb queue without anything to finish the xmit. > > This patch uses a reference counter instead of a bit to account for > the multiple CPUs. > --- oops the commit message is incorrect here; it actually uses a per cpu state bitmask to track this.
[RFC PATCH 00/13] Series short description
I've been working on this for a bit now; figured it's time for a v2 RFC. As usual any comments, suggestions, observations, musings, etc are appreciated. Latest round of lockless qdisc patch set with performance metric primarily using pktgen to inject pkts into the qdisc layer. Some simple netperf tests below as well but those need to be done correctly. This v2 RFC version fixes a couple flaws in the original series. The first major one was that the per_cpu accounting of qlen is not correct with respect to the qdisc bypass. Using per cpu counters for qlen allows a flow to be enqueueing packets into the qdisc and then get scheduled on another core and bypass the qdisc completely if that core is not in use. I've reworked the logic to use an atomic which is _correct_ now but unfortunately costs a lot in performance. With a single pfifo_fast and 12 threads of pktgen I still see a ~200k pps improvement even with atomic accounting so it is still a win but nothing like the +1Mpps without the atomic accounting. On the mq tests it atomic vs per cpu seems to be in the noise I believe because mq qdisc is already aligned with a pfifo_fast qdisc per core with the XPS setup I'm running mapping 1:1. Any thoughts around this would be interesting to hear. My general thinking around this is to submit the atomic version for inclusion and then start to improve it with a few items listed below. Additionally I've added a __netif_schedule() to the bad_skb_tx path otherwise I observed a pkt getting stuck on the bad_txq_cpu path on the pointer and sitting in the qdisc structure until it was kicked again from another pkt or netif_schedule. And on the netif_schedule() topic to support per cpu handling of gso and bad_txq_cpu we have to allow the netif_schedule() logic to fire on a per cpu model as well. 
Otherwise a bunch of small stylistic changes were made and I still need to do another pass to catch checkpatch warnings/errors and try to do a bit more cleanup around the statistics if/else branching. This series also has both the atomic qlen code and the per cpu qlen code as I continue to think up some scheme around the atomic qlen issue. But this series seems to be working. Future work is the following, - convert all qdiscs over to per cpu handling and cleanup the rather ugly if/else statistics handling. Although a bit of work its mechanical and should help some. - I'm looking at fq_codel to see how to make it "lockless". - It seems we can drop the TX_HARD_LOCK on cases where the nic exposes a queue per core now that we have enqueue/dequeue decoupled. The idea being a bunch of threads enqueue and per core dequeue logic runs. Requires XPS to be setup. - qlen improvements somehow - look at improvements to the skb_array structure. We can look at drop in replacements and/or improving it. Below is the data I took from pktgen, ./samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -t $NUM -i eth3 I did a run of 4 each time and took the total summation of each thread. There are four different runs for the mq and pfifo_fast cases. "without qlen atomic" uses per queue qlen values and allows bypassing the qdisc via bypass flag, this is incorrect but shows the impact of having an atomic in the mix. "with qlen atomic" shows the correct implementation with atomics and bypass enabled. And finally "without qlen atomic and no bypass" uses per cpu qlen values and disables bypass to ensure ooo packets are not created. To be clear the submitted patches here are the "with qlen atomic" metrics. 
nolock pfifo_fast (without qlen atomic) 1: 1440293 1421602 1409553 1393469 1424543 2: 1754890 1819292 1727948 1797711 1743427 4: 3282665 3344095 3315220 3332777 3348972 8: 2940079 1644450 2950777 2922085 2946310 12: 2042084 2610060 2857581 3493162 3104611 nolock pfifo_fast (with qlen atomic) 1: 1425231 1417176 1402862 1432880 2: 1631437 1633398 1630867 1628816 4: 1704383 1709900 1706274 1710198 8: 1348672 1344343 1339072 1334288 12: 1262988 1280724 1262237 1262615 nolock pfifo_fast (without qlen atomic and no bypass) 1: 1435796 1458522 1471855 1455658 2: 1880642 1876359 1872879 1884578 4: 1922935 1914589 1912832 1912116 8: 1585055 1576887 1577086 1570236 12: 1479273 1450706 1447056 1466330 lock (pfifo_fast) 1: 1471479 1469142 1458825 1456788 1453952 2: 1746231 1749490 1753176 1753780 1755959 4: 1119626 1120515 1121478 1119220 1121115 8: 1001471 999308 1000318 1000776 1000384 12: 989269 992122 991590 986581 990430 nolock (mq with per cpu qlen) 1: 1435952 1459523 1448860 1385451 1435031 2: 2850662 2855702 2859105 2855443 2843382 4: 5288135 5271192 5252242 5270192 5311642 8: 10042731 10018063 9891813 9968382 9956727 12: 13265277 13384199 13438955 13363771 13436198 nolock (mq with qlen atomic) 1: 1558253 1562285 1555037 1558422 2: 2917449 2952852 2921697 2892313 4: 5518243 5375300
[RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
The sch_mq qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Also exports __gnet_stats_copy_queue() to use as a helper function. Signed-off-by: John Fastabend--- include/net/gen_stats.h |3 +++ net/core/gen_stats.c|9 + net/sched/sch_mq.c | 25 ++--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 231e121..5ddc88b 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -47,6 +47,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +const struct gnet_stats_queue __percpu *cpu_q, +const struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 508e051..a503547 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -254,10 +254,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +const struct gnet_stats_queue __percpu *cpu, +const struct gnet_stats_queue *q, +__u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -271,6 +271,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * 
gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index b943982..f4b5bbb 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -17,6 +17,7 @@ #include #include #include +#include struct mq_sched { struct Qdisc**qdiscs; @@ -107,15 +108,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) memset(>qstats, 0, sizeof(sch->qstats)); for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + __u32 qlen = 0; + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues+= qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, >bstats, + cpu_bstats, >bstats); + __gnet_stats_copy_queue(>qstats, + cpu_qstats, >qstats, qlen); + spin_unlock_bh(qdisc_lock(qdisc)); } return 0;
[RFC PATCH 11/13] net: sched: pfifo_fast use alf_queue
This converts the pfifo_fast qdisc to use the alf_queue enqueue and dequeue routines then sets the NOLOCK bit. This also removes the logic used to pick the next band to dequeue from and instead just checks each alf_queue for packets from top priority to lowest. This might need to be a bit more clever but seems to work for now. Signed-off-by: John Fastabend--- net/sched/sch_generic.c | 133 +++ 1 file changed, 77 insertions(+), 56 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f5b7254..c41a5dd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. 
- */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, -int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return >q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(>q) < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; - return qdisc_drop(skb, qdisc, to_free); + err = skb_array_produce(q, skb); + + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (skb_queue_empty(list)) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume(q); } - return NULL; -} - -static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) -{ - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); - - 
return skb_peek(list); + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q =
Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver
Florian Fainelli wrote: On 08/11/2016 02:34 PM, Timur Tabi wrote: >Add supports for ethernet controller HW on Qualcomm Technologies, Inc. SoC. >This driver supports the following features: >1) Checksum offload. >2) Interrupt coalescing support. >3) SGMII phy. >4) phylib interface for external phy OK, so this is looking good now, just a few nits, feel free to submit as clean up patches, would this one be accepted. I will post a v9. >+soc { >+ #address-cells = <1>; >+ #size-cells = <1>; >+ dma-ranges = <0 0 0x>; >+ >+ emac0: ethernet@feb2 { >+ compatible = "qcom,fsm9900-emac"; >+ #address-cells = <1>; >+ #size-cells = <1>; >+ reg-names = "base", "csr", "ptp", "sgmii"; >+ reg = <0xfeb2 0x1>, >+ <0xfeb36000 0x1000>, >+ <0xfeb3c000 0x4000>, >+ <0xfeb38000 0x400>; >+ interrupt-parent = <>; >+ #interrupt-cells = <1>; >+ interrupts = <76 80>; >+ interrupt-names = "emac_core0", "sgmii_irq"; >+ phy0: ethernet-phy@0 { >+ compatible = "qcom,fsm9900-emac-phy"; >+ reg = <0>; >+ } If this is an external PHY, the expectation is that it will be hanging off a MDIO bus controller, either the MDIO bus internal to the MAC, or an external MDIO bus (separate register range). So for starters, I forgot to delete the 'compatible' property from this document. Its presence breaks of_mdiobus_child_is_phy(). If such a PHY node is provided, the expectation is that your Ethernet MAC will have a phy-handle property and a phy-mode property to specify how to connect to this external PHY, so in your specific case, better remove the PHY from your example, or detail further how it should be done. Something is odd about of_mdiobus_register(). This function scans child nodes to look for PHYs: /* Loop over the child nodes and register a phy_device for each phy */ for_each_available_child_of_node(np, child) { addr = of_mdio_parse_addr(>dev, child); And in my driver, this works. However, later when I try to call of_phy_find_device(), it fails. 
That's because of_phy_find_device() wants the np of the child node. But the only way to get that is with a phy-phandle property which I need to manually parse. So what's the point of having of_mdiobus_register() parse child nodes, if you need a phy-phandle pointer anyway? >diff --git a/drivers/net/ethernet/qualcomm/Kconfig b/drivers/net/ethernet/qualcomm/Kconfig >index a76e380..85b599f 100644 >--- a/drivers/net/ethernet/qualcomm/Kconfig >+++ b/drivers/net/ethernet/qualcomm/Kconfig >@@ -24,4 +24,15 @@ config QCA7000 > To compile this driver as a module, choose M here. The module > will be called qcaspi. > >+config QCOM_EMAC >+ tristate "Qualcomm Technologies, Inc. EMAC Gigabit Ethernet support" >+ select CRC32 select PHYLIB? Ok. >diff --git a/drivers/net/ethernet/qualcomm/Makefile b/drivers/net/ethernet/qualcomm/Makefile >index 9da2d75..1b3a0ce 100644 >--- a/drivers/net/ethernet/qualcomm/Makefile >+++ b/drivers/net/ethernet/qualcomm/Makefile >@@ -4,3 +4,5 @@ > > obj-$(CONFIG_QCA7000) += qcaspi.o > qcaspi-objs := qca_spi.o qca_framing.o qca_7k.o qca_debug.o >+ >+obj-$(CONFIG_QCOM_EMAC) += emac/ Since you have a check on CONFIG_QCOM_EMAC in emac/Makefile, you could always recurse into that directory while building (use obj-y). Obviously, having "obj-$(CONFIG_QCOM_EMAC)" in both Makefiles is redundant, but wouldn't it make more sense to change "obj-$(CONFIG_QCOM_EMAC)" to "obj-y" in drivers/net/ethernet/qualcomm/emac/Makefile, so that I only recurse if necessary? >+static void emac_adjust_link(struct net_device *netdev) >+{ >+ struct emac_adapter *adpt = netdev_priv(netdev); >+ struct phy_device *phydev = netdev->phydev; >+ >+ if (phydev->link) >+ emac_mac_start(adpt); >+ else >+ emac_mac_stop(adpt); This is really excessive here, you typically don't need to completely stop your transmitter/receiver/DMA/what have you, just reprogram the MAC with the appropriate link speed/duplex/pause parameters. 
Yes, I realize that a lot of the reinit code in my driver is "heavy", and that it could be optimized for specific situations. However, I'd like to save that for another patch, because it would require me to study the documentation and do extensive tests. I can't tell you today whether emac_mac_start() is overkill for the specific link change. For all I know, the hardware does require the TX to stop before changing the link speed. Remember, the EMAC does have a weird double-phy (one internal, one external). -- Sent by an employee of the Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation.
[RFC PATCH 13/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio
The sch_mqprio qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Signed-off-by: John Fastabend--- net/sched/sch_mqprio.c | 61 +++- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 549c663..24f0360 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -229,22 +229,32 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int i; + unsigned int ntx, tc; sch->q.qlen = 0; memset(>bstats, 0, sizeof(sch->bstats)); memset(>qstats, 0, sizeof(sch->qstats)); - for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + __u32 qlen = 0; + + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues+= qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, >bstats, + cpu_bstats, >bstats); + __gnet_stats_copy_queue(>qstats, + cpu_qstats, >qstats, qlen); + spin_unlock_bh(qdisc_lock(qdisc)); } @@ -252,9 
+262,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); opt.hw = priv->hw_owned; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; + for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { + opt.count[tc] = dev->tc_to_txq[tc].count; + opt.offset[tc] = dev->tc_to_txq[tc].offset; } if (nla_put(skb, TCA_OPTIONS, sizeof(opt), )) @@ -332,7 +342,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl <= netdev_get_num_tc(dev)) { int i; __u32 qlen = 0; - struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; @@ -347,18 +356,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); + struct Qdisc *qdisc = rtnl_dereference(q->qdisc); + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; - qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); - qlen += qdisc->q.qlen; - bstats.bytes += qdisc->bstats.bytes; - bstats.packets+= qdisc->bstats.packets; - qstats.backlog+= qdisc->qstats.backlog; - qstats.drops += qdisc->qstats.drops; - qstats.requeues += qdisc->qstats.requeues; - qstats.overlimits += qdisc->qstats.overlimits; + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, >bstats, +
[RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc
Similar to how gso is handled skb_bad_tx needs to be per cpu to handle lockless qdisc with multiple writer/producers. Signed-off-by: John Fastabend--- include/net/sch_generic.h |7 +++ net/sched/sch_api.c |6 +++ net/sched/sch_generic.c | 95 + 3 files changed, 99 insertions(+), 9 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0864813..d465fb9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -40,6 +40,10 @@ struct gso_cell { struct sk_buff *skb; }; +struct bad_txq_cell { + struct sk_buff *skb; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -77,7 +81,8 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct gso_cell __percpu *gso_cpu_skb; + struct gso_cell __percpu *gso_cpu_skb; + struct bad_txq_cell __percpu *skb_bad_txq_cpu; /* * For performance sake on SMP, we put highly modified fields at the end diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d713052..b90a23a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -970,6 +970,11 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, sch->gso_cpu_skb = alloc_percpu(struct gso_cell); if (!sch->gso_cpu_skb) goto err_out4; + + sch->skb_bad_txq_cpu = + alloc_percpu(struct bad_txq_cell); + if (!sch->skb_bad_txq_cpu) + goto err_out4; } if (tca[TCA_STAB]) { @@ -1021,6 +1026,7 @@ err_out4: free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); free_percpu(sch->gso_cpu_skb); + free_percpu(sch->skb_bad_txq_cpu); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 29238c4..d10b762 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -44,6 +44,43 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *sch) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + return cell->skb; + } + + return sch->skb_bad_txq; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *sch, +struct sk_buff *skb) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + cell->skb = skb; + __netif_schedule(sch); + return; + } + + sch->skb_bad_txq = skb; +} + +static inline void qdisc_null_skb_bad_txq(struct Qdisc *sch) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + cell->skb = NULL; + return; + } + + sch->skb_bad_txq = NULL; +} + static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch) { if (sch->gso_cpu_skb) @@ -129,9 +166,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -160,7 +203,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, qdisc_null_gso_skb(q); if (qdisc_is_percpu_stats(q)) { - qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb);
[RFC PATCH 09/13] net: sched: helper to sum qlen
Reporting qlen when qlen is per cpu requires aggregating the per cpu counters. This adds a helper routine for this. Signed-off-by: John Fastabend--- include/net/sch_generic.h | 15 +++ net/sched/sch_api.c |3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d465fb9..cc28af0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -280,6 +280,21 @@ static inline int qdisc_qlen(const struct Qdisc *q) return q->q.qlen; } +static inline int qdisc_qlen_sum(const struct Qdisc *q) +{ + __u32 qlen = 0; + int i; + + if (q->flags & TCQ_F_NOLOCK) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { + qlen = q->q.qlen; + } + + return qlen; +} + static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b90a23a..6c5bf13 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1370,7 +1370,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - qlen = q->q.qlen; + + qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0)
[RFC PATCH 10/13] net: sched: lockless support for netif_schedule
netif_schedule uses a bit QDISC_STATE_SCHED to tell the qdisc layer if a run of the qdisc has been scheduler. This is important when tearing down qdisc instances. We can rcu_free an instance for example if its possible that we might have outstanding references to it. Perhaps more importantly in the per cpu lockless case we need to schedule a run of the qdisc on all qdiscs that are enqueu'ing packets and hitting the gso_skb requeue logic or else the skb may get stuck on the gso_skb queue without anything to finish the xmit. This patch uses a reference counter instead of a bit to account for the multiple CPUs. --- include/net/sch_generic.h |1 + net/core/dev.c| 32 +++- net/sched/sch_api.c |5 + net/sched/sch_generic.c | 16 +++- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cc28af0..2e0e5b0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -94,6 +94,7 @@ struct Qdisc { seqcount_t running; struct gnet_stats_queue qstats; unsigned long state; + unsigned long __percpu *cpu_state; struct Qdisc*next_sched; struct sk_buff *skb_bad_txq; struct rcu_head rcu_head; diff --git a/net/core/dev.c b/net/core/dev.c index 5db395d..f491845 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2272,8 +2272,14 @@ static void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { - if (!test_and_set_bit(__QDISC_STATE_SCHED, >state)) + if (q->flags & TCQ_F_NOLOCK) { + unsigned long *s = this_cpu_ptr(q->cpu_state); + + if (!test_and_set_bit(__QDISC_STATE_SCHED, s)) + __netif_reschedule(q); + } else if (!test_and_set_bit(__QDISC_STATE_SCHED, >state)) { __netif_reschedule(q); + } } EXPORT_SYMBOL(__netif_schedule); @@ -3925,15 +3931,23 @@ static void net_tx_action(struct softirq_action *h) if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); - } - /* We need to make sure head->next_sched is read -* before clearing __QDISC_STATE_SCHED -*/ - 
smp_mb__before_atomic(); - clear_bit(__QDISC_STATE_SCHED, >state); - qdisc_run(q); - if (!(q->flags & TCQ_F_NOLOCK)) + + /* We need to make sure head->next_sched is read +* before clearing __QDISC_STATE_SCHED +*/ + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, >state); + + qdisc_run(q); + spin_unlock(root_lock); + } else { + unsigned long *s = this_cpu_ptr(q->cpu_state); + + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, s); + __qdisc_run(q); + } } } } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6c5bf13..89989a6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -975,6 +975,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, alloc_percpu(struct bad_txq_cell); if (!sch->skb_bad_txq_cpu) goto err_out4; + + sch->cpu_state = alloc_percpu(unsigned long); + if (!sch->cpu_state) + goto err_out4; } if (tca[TCA_STAB]) { @@ -1027,6 +1031,7 @@ err_out4: free_percpu(sch->cpu_qstats); free_percpu(sch->gso_cpu_skb); free_percpu(sch->skb_bad_txq_cpu); + free_percpu(sch->cpu_state); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d10b762..f5b7254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, nskb); qdisc_qstats_cpu_qlen_inc(q); + set_thread_flag(TIF_NEED_RESCHED); } else { qdisc_qstats_backlog_inc(q, nskb);
[RFC PATCH 05/13] net: sched: a dflt qdisc may be used with per cpu stats
Enable dflt qdisc support for per cpu stats before this patch a dflt qdisc was required to use the global statistics qstats and bstats. Signed-off-by: John Fastabend--- net/sched/sch_generic.c | 24 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index af32418..f8fec81 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -645,18 +645,34 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, struct Qdisc *sch; if (!try_module_get(ops->owner)) - goto errout; + return NULL; sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) - goto errout; + return NULL; sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL) == 0) + if (!ops->init) return sch; - qdisc_destroy(sch); + if (ops->init(sch, NULL)) + goto errout; + + /* init() may have set percpu flags so init data structures */ + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto errout; + } + + return sch; errout: + qdisc_destroy(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt);
[RFC PATCH 06/13] net: sched: per cpu gso handlers
The net sched infrastructure has a gso ptr that points to skb structs that have failed to be enqueued by the device driver. This can happen when multiple cores try to push a skb onto the same underlying hardware queue resulting in lock contention. This case is handled by a cpu collision handler handle_dev_cpu_collision(). Another case occurs when the stack overruns the drivers low level tx queues capacity. Ideally these should be a rare occurrence in a well-tuned system but they do happen. To handle this in the lockless case use a per cpu gso field to park the skb until the conflict can be resolved. Note at this point the skb has already been popped off the qdisc so it has to be handled by the infrastructure. Signed-off-by: John Fastabend--- include/net/sch_generic.h | 39 + net/sched/sch_api.c |7 net/sched/sch_generic.c | 71 ++--- 3 files changed, 112 insertions(+), 5 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 193cf8c..0864813 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,10 @@ struct qdisc_size_table { u16 data[]; }; +struct gso_cell { + struct sk_buff *skb; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -73,6 +77,8 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; + struct gso_cell __percpu *gso_cpu_skb; + /* * For performance sake on SMP, we put highly modified fields at the end */ @@ -744,6 +750,23 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) return sch->gso_skb; } +static inline struct sk_buff *qdisc_peek_dequeued_cpu(struct Qdisc *sch) +{ + struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb); + + if (!gso->skb) { + struct sk_buff *skb = sch->dequeue(sch); + + if (skb) { + gso->skb = skb; + qdisc_qstats_cpu_backlog_inc(sch, skb); + qdisc_qstats_cpu_qlen_inc(sch); + } + } + + return gso->skb; +} + /* use instead of qdisc->dequeue() for all qdiscs queried with 
->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { @@ -760,6 +783,22 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } +static inline struct sk_buff *qdisc_dequeue_peeked_skb(struct Qdisc *sch) +{ + struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb); + struct sk_buff *skb = gso->skb; + + if (skb) { + gso->skb = NULL; + qdisc_qstats_cpu_backlog_dec(sch, skb); + qdisc_qstats_cpu_qlen_dec(sch); + } else { + skb = sch->dequeue(sch); + } + + return skb; +} + static inline void __qdisc_reset_queue(struct sk_buff_head *list) { /* diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 12ebde8..d713052 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -966,6 +966,12 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out4; } + if (sch->flags & TCQ_F_NOLOCK) { + sch->gso_cpu_skb = alloc_percpu(struct gso_cell); + if (!sch->gso_cpu_skb) + goto err_out4; + } + if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -1014,6 +1020,7 @@ err_out: err_out4: free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); + free_percpu(sch->gso_cpu_skb); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f8fec81..3b9a21f 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -44,8 +44,25 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. 
*/ +static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch) +{ + if (sch->gso_cpu_skb) + return (this_cpu_ptr(sch->gso_cpu_skb))->skb; -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) + return sch->gso_skb; +} + +static inline void qdisc_null_gso_skb(struct Qdisc *sch) +{ + if (sch->gso_cpu_skb) { + (this_cpu_ptr(sch->gso_cpu_skb))->skb = NULL; + return; + } + + sch->gso_skb = NULL; +} + +static inline int
[RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc
The qdisc_reset operation depends on the qdisc lock at the moment to halt any additions to gso_skb and statistics while the list is free'd and the stats zeroed. Without the qdisc lock we can not guarantee another cpu is not in the process of adding a skb to one of the "cells". Here are the two cases we have to handle. case 1: qdisc_graft operation. In this case a "new" qdisc is attached and the 'qdisc_destroy' operation is called on the old qdisc. The destroy operation will wait a rcu grace period and call qdisc_rcu_free(). At which point gso_cpu_skb is free'd along with all stats so no need to zero stats and gso_cpu_skb from the reset operation itself. Because we can not continue to call qdisc_reset before waiting an rcu grace period so that the qdisc is detached from all cpus simply do not call qdisc_reset() at all and let the qdisc_destroy operation clean up the qdisc. Note, a refcnt greater than 1 would cause the destroy operation to be aborted however if this ever happened the reference to the qdisc would be lost and we would have a memory leak. case 2: dev_deactivate sequence. This can come from a user bringing the interface down which causes the gso_skb list to be flushed and the qlen zero'd. At the moment this is protected by the qdisc lock so while we clear the qlen/gso_skb fields we are guaranteed no new skbs are added. For the lockless case though this is not true. To resolve this move the qdisc_reset call after the new qdisc is assigned and a grace period is exercised to ensure no new skbs can be enqueued. Further the RTNL lock is held so we can not get another call to activate the qdisc while the skb lists are being free'd. Finally, fix qdisc_reset to handle the per cpu stats and skb lists. 
Signed-off-by: John Fastabend--- net/sched/sch_generic.c | 45 +++-- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3b9a21f..29238c4 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc) kfree_skb(qdisc->skb_bad_txq); qdisc->skb_bad_txq = NULL; + if (qdisc->gso_cpu_skb) { + int i; + + for_each_possible_cpu(i) { + struct gso_cell *cell; + + cell = per_cpu_ptr(qdisc->gso_cpu_skb, i); + if (cell) { + kfree_skb_list(cell->skb); + cell = NULL; + } + } + } + if (qdisc->gso_skb) { kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; @@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && atomic_read(>refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = _qdisc; @@ -929,7 +939,6 @@ static void dev_deactivate_queue(struct net_device *dev, set_bit(__QDISC_STATE_DEACTIVATED, >state); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -966,6 +975,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -976,7 +995,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -986,20 +1004,27 @@ void dev_deactivate_many(struct list_head *head) _qdisc); dev_watchdog_down(dev); - 
sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /*
[RFC PATCH 01/13] net: sched: allow qdiscs to handle locking
This patch adds a flag for queueing disciplines to indicate the stack does not need to use the qdisc lock to protect operations. This can be used to build lockless scheduling algorithms and improving performance. The flag is checked in the tx path and the qdisc lock is only taken if it is not set. For now use a conditional if statement. Later we could be more aggressive if it proves worthwhile and use a static key or wrap this in a likely(). Signed-off-by: John Fastabend--- include/net/pkt_sched.h |4 +++- include/net/sch_generic.h |1 + net/core/dev.c| 32 net/sched/sch_generic.c | 26 -- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 7caa99b..69540c6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { - if (qdisc_run_begin(q)) + if (qdisc_run_begin(q)) { __qdisc_run(q); + qdisc_run_end(q); + } } int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 909aff2..3de6a8c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -58,6 +58,7 @@ struct Qdisc { #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. 
*/ +#define TCQ_F_NOLOCK 0x80 /* qdisc does not require locking */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/net/core/dev.c b/net/core/dev.c index 4ce07dc..5db395d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, >state))) { + __qdisc_drop(skb, _free); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) { + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) + __qdisc_run(q); + rc = NET_XMIT_SUCCESS; + } else { + rc = q->enqueue(skb, q, _free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. 
@@ -3118,6 +3138,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@ -3897,19 +3918,22 @@ static void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, >state); qdisc_run(q); - spin_unlock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) + spin_unlock(root_lock); } } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e95b67c..af32418 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -170,7 +170,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, int ret = NETDEV_TX_BUSY; /* And release qdisc */ - spin_unlock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) @@ -183,10 +184,13 @@ int
[RFC PATCH 04/13] net: sched: provide atomic qlen helpers for bypass case
The qlen is used by the core/dev.c to determine if a packet can skip the qdisc on qdiscs with bypass enabled. In these cases a per cpu qlen value can cause one cpu to bypass a qdisc that has packets in it. To avoid this case use the simplest solution I could come up with for now and add an atomic qlen value to the qdisc to use in these cases. Signed-off-by: John Fastabend--- include/net/sch_generic.h | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f69da4b..193cf8c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -78,6 +78,7 @@ struct Qdisc { */ struct sk_buff *gso_skb cacheline_aligned_in_smp; struct sk_buff_head q; + atomic_tqlen_atomic; struct gnet_stats_basic_packed bstats; seqcount_t running; struct gnet_stats_queue qstats; @@ -247,6 +248,11 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_atomic(const struct Qdisc *q) +{ + return atomic_read(>qlen_atomic); +} + static inline int qdisc_qlen_cpu(const struct Qdisc *q) { return this_cpu_ptr(q->cpu_qstats)->qlen; @@ -254,8 +260,11 @@ static inline int qdisc_qlen_cpu(const struct Qdisc *q) static inline int qdisc_qlen(const struct Qdisc *q) { + /* current default is to use atomic ops for qdisc qlen when +* running with TCQ_F_NOLOCK. +*/ if (q->flags & TCQ_F_NOLOCK) - return qdisc_qlen_cpu(q); + return qdisc_qlen_atomic(q); return q->q.qlen; } @@ -595,6 +604,16 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, q->backlog += qdisc_pkt_len(skb); } +static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch) +{ + atomic_inc(>qlen_atomic); +} + +static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch) +{ + atomic_dec(>qlen_atomic); +} + static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { this_cpu_ptr(sch->cpu_qstats)->qlen++;
[RFC PATCH 02/13] net: sched: qdisc_qlen for per cpu logic
This is a bit interesting because it means sch_direct_xmit will return a positive value which causes the dequeue/xmit cycle to continue only when a specific cpu has a qlen > 0. However checking each cpu for qlen will break performance so its important to note that qdiscs that set the no lock bit need to have some sort of per cpu enqueue/dequeue data structure that maps to the per cpu qlen value. Signed-off-by: John Fastabend--- include/net/sch_generic.h |8 1 file changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3de6a8c..354951d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -247,8 +247,16 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { + if (q->flags & TCQ_F_NOLOCK) + return qdisc_qlen_cpu(q); + return q->q.qlen; }
[RFC PATCH 03/13] net: sched: provide per cpu qstat helpers
The per cpu qstats support was added with per cpu bstat support which is currently used by the ingress qdisc. This patch adds a set of helpers needed to make other qdiscs that use qstats per cpu as well. Signed-off-by: John Fastabend--- include/net/sch_generic.h | 39 +++ 1 file changed, 39 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 354951d..f69da4b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -573,12 +573,43 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, sch->qstats.backlog -= qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats); + + q->backlog -= qdisc_pkt_len(skb); +} + static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats); + + q->backlog += qdisc_pkt_len(skb); +} + +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->qlen++; +} + +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->qlen--; +} + +static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->requeues++; +} + static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; @@ -751,6 +782,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) qdisc_qstats_drop(sch); } +static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, +struct sk_buff **to_free) +{ + __qdisc_drop(skb, to_free); + qdisc_qstats_cpu_drop(sch); + + return NET_XMIT_DROP; +} static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff 
**to_free)
Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver
Timur Tabi wrote: The question, how should adpt->phy.version be initialized on device tree and ACPI platforms? The reason why this is confusing is because there are conflicting perspectives of this "internal PHY". It both is and is not part of the EMAC. It is a separate block in the chip, and can be replaced with other SGMII blocks, and in the future there will be a "v3" and maybe a "v4" or who knows what. In fact, maybe using version numbers is inappropriate and we'll need to use vendor names or something. The problem is that the internal PHY has mechanism for self-identification. There is no ACPI or device tree node for it. There is no register you can query to see if it's there or what version it is. The MAC part of the EMAC has no knowledge of the PHY part. The driver just has to know it's there. This is why I'm advocating a separate property (DT and ACPI) to identify the internal PHY. So we had some internal discussion on this. Different compatible properties for DT are okay, and for ACPI we'll either create a "phy-version" property (DT-like properties are called DSDs in ACPI) or use an HRV (a specific property for the hardware version). But this speaks to the greater problem of mapping DT compatible strings to ACPI. Like I said, the ACPI equivalent of a compatible string is the HID. But unlike DT, we cannot create new HIDs whenever we feel like it. The process for generating and assigning HIDs is much more involved, and we do have a very limited namespace. -- Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
Re: [RFC PATCH 4/5] net: filter: run cgroup eBPF programs
On Wed, Aug 17, 2016 at 11:20:29AM -0700, Alexei Starovoitov wrote: > On Wed, Aug 17, 2016 at 04:00:47PM +0200, Daniel Mack wrote: > > If CONFIG_CGROUP_BPF is enabled, and the cgroup associated with the > > receiving socket has an eBPF programs installed, run them from > > sk_filter_trim_cap(). > > > > eBPF programs used in this context are expected to either return 1 to > > let the packet pass, or != 1 to drop them. The programs have access to > > the full skb, including the MAC headers. > > > > This patch only implements the call site for ingress packets. > > > > Signed-off-by: Daniel Mack> > --- > > net/core/filter.c | 44 > > 1 file changed, 44 insertions(+) > > > > diff --git a/net/core/filter.c b/net/core/filter.c > > index c5d8332..a1dd94b 100644 > > --- a/net/core/filter.c > > +++ b/net/core/filter.c > > @@ -52,6 +52,44 @@ > > #include > > #include > > > > +#ifdef CONFIG_CGROUP_BPF > > +static int sk_filter_cgroup_bpf(struct sock *sk, struct sk_buff *skb, > > + enum bpf_attach_type type) > > +{ > > + struct sock_cgroup_data *skcd = >sk_cgrp_data; > > + struct cgroup *cgrp = sock_cgroup_ptr(skcd); > > + struct bpf_prog *prog; > > + int ret = 0; > > + > > + rcu_read_lock(); > > + > > + switch (type) { > > + case BPF_ATTACH_TYPE_CGROUP_EGRESS: > > + prog = rcu_dereference(cgrp->bpf_egress); > > + break; > > + case BPF_ATTACH_TYPE_CGROUP_INGRESS: > > + prog = rcu_dereference(cgrp->bpf_ingress); > > + break; > > + default: > > + WARN_ON_ONCE(1); > > + ret = -EINVAL; > > + break; > > + } > > + > > + if (prog) { > > I really like how in this version of the patches it became > a single load+cmp of per-packet cost when this feature is off. > Please move > + struct cgroup *cgrp = sock_cgroup_ptr(skcd); > into if (prog) {..} > to make sure it's actually single load. > The compiler cannot avoid that load when it's placed at the top. sorry. brain fart. it is two loads. scratch that. 
> > + unsigned int offset = skb->data - skb_mac_header(skb); > > + > > + __skb_push(skb, offset); > > + ret = bpf_prog_run_clear_cb(prog, skb) > 0 ? 0 : -EPERM; > > that doesn't match commit log. The above '> 0' makes sense to me though. > If we want to do it for 1 only we have to define it in uapi/bpf.h > as action code, so we can extend to 2, 3 in the future if necessary. > > It also have to be bpf_prog_run_save_cb() (as sk_filter_trim_cap() does) > instead of bpf_prog_run_clear_cb(). > See commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs") > > > + __skb_pull(skb, offset); > > + } > > + > > + rcu_read_unlock(); > > + > > + return ret; > > +} > > +#endif /* !CONFIG_CGROUP_BPF */ > > + > > /** > > * sk_filter_trim_cap - run a packet through a socket filter > > * @sk: sock associated with _buff > > @@ -78,6 +116,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff > > *skb, unsigned int cap) > > if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) > > return -ENOMEM; > > > > +#ifdef CONFIG_CGROUP_BPF > > + err = sk_filter_cgroup_bpf(sk, skb, BPF_ATTACH_TYPE_CGROUP_INGRESS); > > + if (err) > > + return err; > > +#endif > > + > > err = security_sock_rcv_skb(sk, skb); > > if (err) > > return err; > > -- > > 2.5.5 > >
Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
On Wed, Aug 17, 2016 at 09:16:02AM -0700, Eric Dumazet wrote: > On Wed, 2016-08-17 at 16:00 +0200, Daniel Mack wrote: > > > + progp = is_ingress ? &cgrp->bpf_ingress : &cgrp->bpf_egress; > > + > > + rcu_read_lock(); > > + old_prog = rcu_dereference(*progp); > > + rcu_assign_pointer(*progp, prog); > > + > > + if (old_prog) > > + bpf_prog_put(old_prog); > > + > > + rcu_read_unlock(); > > > This is a bogus locking strategy. yep. this rcu_lock/unlock won't solve the race between parallel bpf_prog_attach calls. please use xchg() similar to bpf_fd_array_map_update_elem() Then prog pointer will be swapped atomically and bpf_prog_put() will free it via call_rcu. The reader side in sk_filter_cgroup_bpf() looks correct.
Re: [RFC PATCH 4/5] net: filter: run cgroup eBPF programs
On Wed, Aug 17, 2016 at 04:00:47PM +0200, Daniel Mack wrote: > If CONFIG_CGROUP_BPF is enabled, and the cgroup associated with the > receiving socket has an eBPF programs installed, run them from > sk_filter_trim_cap(). > > eBPF programs used in this context are expected to either return 1 to > let the packet pass, or != 1 to drop them. The programs have access to > the full skb, including the MAC headers. > > This patch only implements the call site for ingress packets. > > Signed-off-by: Daniel Mack> --- > net/core/filter.c | 44 > 1 file changed, 44 insertions(+) > > diff --git a/net/core/filter.c b/net/core/filter.c > index c5d8332..a1dd94b 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -52,6 +52,44 @@ > #include > #include > > +#ifdef CONFIG_CGROUP_BPF > +static int sk_filter_cgroup_bpf(struct sock *sk, struct sk_buff *skb, > + enum bpf_attach_type type) > +{ > + struct sock_cgroup_data *skcd = >sk_cgrp_data; > + struct cgroup *cgrp = sock_cgroup_ptr(skcd); > + struct bpf_prog *prog; > + int ret = 0; > + > + rcu_read_lock(); > + > + switch (type) { > + case BPF_ATTACH_TYPE_CGROUP_EGRESS: > + prog = rcu_dereference(cgrp->bpf_egress); > + break; > + case BPF_ATTACH_TYPE_CGROUP_INGRESS: > + prog = rcu_dereference(cgrp->bpf_ingress); > + break; > + default: > + WARN_ON_ONCE(1); > + ret = -EINVAL; > + break; > + } > + > + if (prog) { I really like how in this version of the patches it became a single load+cmp of per-packet cost when this feature is off. Please move + struct cgroup *cgrp = sock_cgroup_ptr(skcd); into if (prog) {..} to make sure it's actually single load. The compiler cannot avoid that load when it's placed at the top. > + unsigned int offset = skb->data - skb_mac_header(skb); > + > + __skb_push(skb, offset); > + ret = bpf_prog_run_clear_cb(prog, skb) > 0 ? 0 : -EPERM; that doesn't match commit log. The above '> 0' makes sense to me though. 
If we want to do it for 1 only we have to define it in uapi/bpf.h as action code, so we can extend to 2, 3 in the future if necessary. It also have to be bpf_prog_run_save_cb() (as sk_filter_trim_cap() does) instead of bpf_prog_run_clear_cb(). See commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs") > + __skb_pull(skb, offset); > + } > + > + rcu_read_unlock(); > + > + return ret; > +} > +#endif /* !CONFIG_CGROUP_BPF */ > + > /** > * sk_filter_trim_cap - run a packet through a socket filter > * @sk: sock associated with _buff > @@ -78,6 +116,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff > *skb, unsigned int cap) > if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) > return -ENOMEM; > > +#ifdef CONFIG_CGROUP_BPF > + err = sk_filter_cgroup_bpf(sk, skb, BPF_ATTACH_TYPE_CGROUP_INGRESS); > + if (err) > + return err; > +#endif > + > err = security_sock_rcv_skb(sk, skb); > if (err) > return err; > -- > 2.5.5 >
Re: [RFC PATCH 2/5] cgroup: add bpf_{e,in}gress pointers
On Wed, Aug 17, 2016 at 04:00:45PM +0200, Daniel Mack wrote: > Add two pointers for eBPF programs to struct cgroup. These will be used > to store programs for ingress and egress for accounting and filtering. > > This new feature is guarded by CONFIG_CGROUP_BPF. ... > +#ifdef CONFIG_CGROUP_BPF > + /* used by the networking layer */ > + struct bpf_prog *bpf_ingress; > + struct bpf_prog *bpf_egress; > +#endif ... > +config CGROUP_BPF > + bool "Enable eBPF programs in cgroups" > + depends on BPF_SYSCALL > + help > + This options allows cgroups to accommodate eBPF programs that > + can be used for network traffic filtering and accounting. See > + Documentation/networking/filter.txt for more information. > + I think this extra config is unnecessary. It makes the code harder to follow. Anyone turning on bpf syscall and cgroups should be able to have this feature. Extra config is imo overkill.
Re: [RFC PATCH 2/5] cgroup: add bpf_{e,in}gress pointers
Hello, On Wed, Aug 17, 2016 at 10:50:40AM -0700, Alexei Starovoitov wrote: > > +config CGROUP_BPF > > + bool "Enable eBPF programs in cgroups" > > + depends on BPF_SYSCALL > > + help > > + This options allows cgroups to accommodate eBPF programs that > > + can be used for network traffic filtering and accounting. See > > + Documentation/networking/filter.txt for more information. > > + > > I think this extra config is unnecessary. It makes the code harder to follow. > Anyone turning on bpf syscall and cgroups should be able to have this feature. > Extra config is imo overkill. Agreed, the added code is pretty small, especially in comparison to both cgroup and bpf, and the only memory overhead would be four pointers per cgroup, which shouldn't be noticeable at all. Thanks. -- tejun
[PATCH] net: ethernet: nuvoton: fix spelling mistake: "aligment" -> "alignment"
From: Colin Ian Kingtrivial fix to spelling mistake in dev_err message Signed-off-by: Colin Ian King --- drivers/net/ethernet/nuvoton/w90p910_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c index 87b7b81..712d8bc 100644 --- a/drivers/net/ethernet/nuvoton/w90p910_ether.c +++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c @@ -751,7 +751,7 @@ static void netdev_rx(struct net_device *dev) dev_err(>dev, "rx crc err\n"); ether->stats.rx_crc_errors++; } else if (status & RXDS_ALIE) { - dev_err(>dev, "rx aligment err\n"); + dev_err(>dev, "rx alignment err\n"); ether->stats.rx_frame_errors++; } else if (status & RXDS_PTLE) { dev_err(>dev, "rx longer err\n"); -- 2.9.3
Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
On Wed, Aug 17, 2016 at 05:51:44PM +0200, Daniel Mack wrote: > On 08/17/2016 05:06 PM, Tejun Heo wrote: > > What I'd suggest is keeping two sets of pointers, one set for the > > programs explicitly attached to the cgroup and the other for programs > > which are to be executed for the cgroup. The two pointers in the > > latter set will point to the closest non-null program in its ancestry. > > Note that the pointers may point to programs belonging to different > > ancestors. > > > > A - B - C > > \ D - E > > > > Let's say B has both ingress and egress programs and D only has > > egress. E's execution pointers should point to D's egress program and > > B's ingress program. > > > > The pointers would need to be updated > > > > 1. When a program is installed or removed. For simplicity's sake, we > >can just walk the whole hierarchy. If that's too expensive > >(shouldn't be), we can find out the closest ancestor where both > >programs can be determined and then walk from there. > > > > 2. When a new cgroup is created, it should inherit its execution table > >from its parent. > > Ok, yes, that sounds sane to me. Will implement that scheme. That makes sense to me as well. Sounds quite flexible and fast with only one check per packet per direction.
Re: [PATCH v2 1/1] ppp: Fix one deadlock issue of PPP when send frame
On Tue, Aug 16, 2016 at 07:33:38PM +0800, f...@ikuai8.com wrote: > From: Gao Feng> > PPP channel holds one spinlock before send frame. But the skb may > select the same PPP channel with wrong route policy. As a result, > the skb reaches the same channel path. It tries to get the same > spinlock which is held before. Bang, the deadlock comes out. > Unless I misunderstood the problem you're trying to solve, this patch doesn't really help: deadlock still occurs if the same IP is used for L2TP and PPP's peer address. > Now add one lock owner to avoid it like xmit_lock_owner of > netdev_queue. Check the lock owner before try to get the spinlock. > If the current cpu is already the owner, needn't lock again. When > PPP channel holds the spinlock at the first time, it sets owner > with current CPU ID. > I think you should forbid lock recursion entirely, and drop the packet if the owner tries to re-acquire the channel lock. Otherwise you just move the deadlock down the stack (l2tp_xmit_skb() can't be called recursively). > diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c > index 70cfa06..6909ab1 100644 > --- a/drivers/net/ppp/ppp_generic.c > +++ b/drivers/net/ppp/ppp_generic.c > @@ -162,6 +162,37 @@ struct ppp { >|SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \ >|SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP) > > +struct chennel_lock { ^ I guess you meant 'channel_lock'. > + spinlock_t lock; > + u32 owner; This field's default value is -1. So why not declaring it as 'int'? -- To unsubscribe from this list: send the line "unsubscribe linux-ppp" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: regression in xgene-enet in 4.8-rc series, oops from xgene_enet_probe
Hi Riku, >> When the driver is configured as kernel module and when it gets >> unloaded and reloaded, kernel crash was observed. This patch >> addresses the software cleanup by doing the following, >> >> - Moved register_netdev call after hardware is ready >> - Since ndev is not ready, added set_irq_name to set irq name >> - Since ndev is not ready, changed mdio_bus->parent to pdev->dev >> - Replaced netif_start(stop)_queue by netif_tx_start(stop)_queues >> - Removed napi_del call since it's called by free_netdev >> - Added dev_close call, within remove >> - Added shutdown callback >> - Changed to use dmam_ APIs > > Bisecting points this patch, commited as > cb0366b7c16427a25923350b69f53a5b1345a34b the cause of oops when > booting apm mustang: > > [1.670201] [ cut here ] > [1.674804] WARNING: CPU: 2 PID: 1 at ../net/core/dev.c:6696 > rollback_registered_many+0x60/0x300 > [1.683543] Modules linked in: realtek > [1.687291] > [1.688774] CPU: 2 PID: 1 Comm: swapper/0 Not tainted > 4.8.0-rc2-00037-g3ec60b92d3ba #1 > [1.696648] Hardware name: APM X-Gene Mustang board (DT) > [1.701930] task: 8003ee078000 task.stack: 8003ee054000 > [1.707819] PC is at rollback_registered_many+0x60/0x300 > [1.713102] LR is at rollback_registered_many+0x30/0x300 > [1.718384] pc : [] lr : [] pstate: 2045 > [1.725739] sp : 8003ee057b00 > [1.729034] x29: 8003ee057b00 x28: 8003eda1a000 > [1.734338] x27: 0002 x26: 8003ebcba970 > [1.739641] x25: 8003eda1a208 x24: 8003eda1a010 > [1.744945] x23: 8003ee057c58 x22: 8003ebcba000 > [1.750247] x21: ffed x20: 8003ee057b70 > [1.755549] x19: 8003ee057b50 x18: 08dfafff > [1.760852] x17: 0007 x16: 0001 > [1.766154] x15: 08ce2000 x14: > [1.771458] x13: 0008 x12: 0030 > [1.776760] x11: 0030 x10: 0101010101010101 > [1.782062] x9 : x8 : 8003df80c700 > [1.787365] x7 : x6 : 0001 > [1.792668] x5 : dead0100 x4 : dead0200 > [1.797971] x3 : 8003ebcba070 x2 : > [1.803273] x1 : 8003ee057b00 x0 : 8003ebcba000 > [1.808575] > [1.810057] ---[ end trace 93f1dda704e63533 ]--- 
> [1.814648] Call trace: > [1.816207] ata2: SATA link down (SStatus 0 SControl 4300) > [1.822535] Exception stack(0x8003ee057930 to 0x8003ee057a60) > [1.828941] 7920: > 8003ee057b50 0001 > [1.836729] 7940: 8003ee057b00 08773c18 > 8003ee057980 08849a1c > [1.844517] 7960: 0009 08e5 > 8003ee0579a0 086eb03c > [1.852305] 7980: 08dbcde8 8003fffe1ca0 > 0040 8003ee057998 > [1.860094] 79a0: 8003ee0579e0 086eb1b0 > 0004 8003ee057a4c > f8003ebcba000 8003ee057b00 > [1.875669] 79e0: 8003ebcba070 > dead0200 dead0100 > [1.883457] 7a00: 0001 > 8003df80c700 > [1.884198] ata4: SATA link down (SStatus 0 SControl 4300) > [1.884211] ata3: SATA link down (SStatus 0 SControl 4300) > [1.902153] 7a20: 0101010101010101 0030 > 0030 0008 > [1.909941] 7a40: 08ce2000 > 0001 0007 > [1.917730] [] rollback_registered_many+0x60/0x300 > [1.924050] [] rollback_registered+0x28/0x40 > [1.929852] [] unregister_netdevice_queue+0x78/0xb8 > [1.936259] [] unregister_netdev+0x20/0x30 > [1.941889] [] xgene_enet_probe+0x638/0xf98 > [1.947605] [] platform_drv_probe+0x50/0xb8 > [1.953320] [] driver_probe_device+0x204/0x2b0 > [1.959294] [] __driver_attach+0xac/0xb0 > [1.964751] [] bus_for_each_dev+0x60/0xa0 > [1.970293] [] driver_attach+0x20/0x28 > [1.975576] [] bus_add_driver+0x1d0/0x238 > [1.981118] [] driver_register+0x60/0xf8 > [1.986574] [] __platform_driver_register+0x40/0x48 > [1.992982] [] xgene_enet_driver_init+0x18/0x20 > [1.999044] [] do_one_initcall+0x38/0x128 > [2.004588] [] kernel_init_freeable+0x1ac/0x250 > [2.010651] [] kernel_init+0x10/0x100 > [2.015847] [] ret_from_fork+0x10/0x40 > [2.021152] network todo 'eth%d' but state 0 > > Picked up from: > > https://storage.kernelci.org/mainline/v4.8-rc2-37-g3ec60b92d3ba/arm64-defconfig/lab-cambridge/boot-apm-mustang.html > > Visible on all mainline/apt-mustang boot reports. net-next seems to > have a fix for this. Iyappan will follow up if required. But I notice that you are using 1.13.xx boot loader. 
The latest officially released boot loader version is 1.15.23 (from late 2015). I would suggest and recommend that you upgrade if possible. -Loc
Server fails to boot with net-next
Just updated to the latest net-next and boot crashed in x2apic_cluster_probe(). Bisection seemed to point to: commit 6b2c28471de550308784560206c3365e5179d42f Author: Sebastian Andrzej Siewior Date: Wed Jul 13 17:17:00 2016 + x86/x2apic: Convert to CPU hotplug state machine When I re-compiled after removing CONFIG_X86_X2APIC, server managed to load properly. Are there known issues here? config-bisect-failing Description: config-bisect-failing
Re: Server fails to boot with net-next
On 2016-08-17 16:58:02 [+], Yuval Mintz wrote: > Are there known issues here? Fixed in -rc2 by http://git.kernel.org/tip/d52c0569bab4edc32df44dc7ac28517134f6 Sebastian
[PATCH net] netfilter: tproxy: properly refcount tcp listeners
From: Eric Dumazetinet_lookup_listener() and inet6_lookup_listener() no longer take a reference on the found listener. This minimal patch adds back the refcounting, but we might do this differently in net-next later. Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") Reported-and-tested-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet --- Note: bug added in 4.7, stable candidate. net/netfilter/xt_TPROXY.c |4 1 file changed, 4 insertions(+) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 7f4414d26a66..663c4c3c9072 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, daddr, dport, in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound @@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, daddr, ntohs(dport), in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound
Re: kernel panic TPROXY , vanilla 4.7.1
On Wed, 2016-08-17 at 19:44 +0300, Denys Fedoryshchenko wrote: > Yes, everything fine after patch! > Thanks a lot Perfect, thanks for testing, I am sending the official patch.
Re: kernel panic TPROXY , vanilla 4.7.1
On 2016-08-17 19:04, Eric Dumazet wrote: On Wed, 2016-08-17 at 08:42 -0700, Eric Dumazet wrote: On Wed, 2016-08-17 at 17:31 +0300, Denys Fedoryshchenko wrote: > Hi! > > Tried to run squid on latest kernel, and hit a panic > Sometimes it just shows warning in dmesg (but doesnt work properly) > [ 75.701666] IPv4: Attempt to release TCP socket in state 10 > 88102d430780 > [ 83.866974] squid (2700) used greatest stack depth: 12912 bytes left > [ 87.506644] IPv4: Attempt to release TCP socket in state 10 > 880078a48780 > [ 114.704295] IPv4: Attempt to release TCP socket in state 10 > 881029f8ad00 > > I cannot catch yet oops/panic message, netconsole not working. > > After triggering warning message 3 times, i am unable to run squid > anymore (without reboot), and in netstat it doesnt show port running. > > firewall is: > *mangle > -A PREROUTING -p tcp -m socket -j DIVERT > -A PREROUTING -p tcp -m tcp --dport 80 -i eno1 -j TPROXY --on-port 3129 > --on-ip 0.0.0.0 --tproxy-mark 0x1/0x1 > -A DIVERT -j MARK --set-xmark 0x1/0x > -A DIVERT -j ACCEPT > > routing > ip rule add fwmark 1 lookup 100 > ip route add local default dev eno1 table 100 > > > squid config is default with tproxy option > http_port 3129 tproxy > Hmppff... sorry for this, I will send a fix. Thanks for the report ! Could you try the following ? Thanks ! 
net/netfilter/xt_TPROXY.c |4 1 file changed, 4 insertions(+) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 7f4414d26a66..663c4c3c9072 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, daddr, dport, in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound @@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, daddr, ntohs(dport), in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound Yes, everything fine after patch! Thanks a lot
Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD
On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote: > On 17.08.2016 12:28, Mike Manning wrote: >> +static void dev_disable_change(struct inet6_dev *idev); >> >> /* >> * Configured unicast address hash table >> @@ -1945,6 +1946,12 @@ lock_errdad: >> >> pr_info("%s: IPv6 being disabled!\n", >> ifp->idev->dev->name); >> +spin_unlock_bh(>lock); >> +addrconf_dad_stop(ifp, 1); >> +rtnl_lock(); >> +dev_disable_change(idev); >> +rtnl_unlock(); >> +return; >> } >> } > > You can't take rtnl_lock at that point but must postpone the actions and > do that in addrconf_dad_work. > > Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs > to move there. > > Bye, > Hannes > > Thanks for the prompt review, I will look into making these changes. Also these changes caused a build error due to conditional compilation without CONFIG_SYSCTL, which is resolved by replacing the call to dev_disable_change(idev) by directly calling addrconf_ifdown(idev->dev, 0) instead. I would appreciate any further comments if the suggested change in behavior is not acceptable. Thanks Mike
Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD
On Wed, Aug 17, 2016, at 18:08, Mike Manning wrote: > On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote: > > On 17.08.2016 12:28, Mike Manning wrote: > >> +static void dev_disable_change(struct inet6_dev *idev); > >> > >> /* > >> *Configured unicast address hash table > >> @@ -1945,6 +1946,12 @@ lock_errdad: > >> > >>pr_info("%s: IPv6 being disabled!\n", > >>ifp->idev->dev->name); > >> + spin_unlock_bh(>lock); > >> + addrconf_dad_stop(ifp, 1); > >> + rtnl_lock(); > >> + dev_disable_change(idev); > >> + rtnl_unlock(); > >> + return; > >>} > >>} > > > > You can't take rtnl_lock at that point but must postpone the actions and > > do that in addrconf_dad_work. > > > > Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs > > to move there. > > > > Bye, > > Hannes > > > > > > Thanks for the prompt review, I will look into making these changes. > > Also these changes caused a build error due to conditional compilation > without CONFIG_SYSCTL, which is resolved by replacing the call to > dev_disable_change(idev) by directly calling addrconf_ifdown(idev->dev, > 0) instead. > > I would appreciate any further comments if the suggested change in > behavior is not acceptable. What you describe in the changelog what is happening right now looks like a bug to me thus your patch made sense to me. Bye, Hannes
Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
On Wed, 2016-08-17 at 16:00 +0200, Daniel Mack wrote: > + progp = is_ingress ? >bpf_ingress : >bpf_egress; > + > + rcu_read_lock(); > + old_prog = rcu_dereference(*progp); > + rcu_assign_pointer(*progp, prog); > + > + if (old_prog) > + bpf_prog_put(old_prog); > + > + rcu_read_unlock(); This is a bogus locking strategy. You do not want to use rcu_read_lock()/rcu_read_unlock() here, but appropriate writer exclusion (a mutex probably, or a spinlock) Then use rcu_dereference_protected() instead of rcu_dereference(*progp);
Re: [PATCH net] tcp: fix use after free in tcp_xmit_retransmit_queue()
On Wed, Aug 17, 2016 at 5:56 AM, Eric Dumazetwrote: > From: Eric Dumazet > > When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the > tail of the write queue using tcp_add_write_queue_tail() > > Then it attempts to copy user data into this fresh skb. > > If the copy fails, we undo the work and remove the fresh skb. > > Unfortunately, this undo lacks the change done to tp->highest_sack and > we can leave a dangling pointer (to a freed skb) > > Later, tcp_xmit_retransmit_queue() can dereference this pointer and > access freed memory. For regular kernels where memory is not unmapped, > this might cause SACK bugs because tcp_highest_sack_seq() is buggy, > returning garbage instead of tp->snd_nxt, but with various debug > features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. > > This bug was found by Marco Grassi thanks to syzkaller. > > Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to > next skb") > Reported-by: Marco Grassi > Signed-off-by: Eric Dumazet > Cc: Ilpo Järvinen > Cc: Yuchung Cheng > Cc: Neal Cardwell > --- > include/net/tcp.h |2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/include/net/tcp.h b/include/net/tcp.h > index c00e7d51bb18..7717302cab91 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -1523,6 +1523,8 @@ static inline void tcp_check_send_head(struct sock *sk, > struct sk_buff *skb_unli > { > if (sk->sk_send_head == skb_unlinked) > sk->sk_send_head = NULL; > + if (tcp_sk(sk)->highest_sack == skb_unlinked) > + tcp_sk(sk)->highest_sack = NULL; > } > Nit: the function name probably needs to change too, since it now checks more than just send_head. ;) But we can always do this for net-next, don't let this be a blocker for this security fix. Reviewed-by: Cong Wang Thanks!
Re: kernel panic TPROXY , vanilla 4.7.1
On Wed, 2016-08-17 at 08:42 -0700, Eric Dumazet wrote: > On Wed, 2016-08-17 at 17:31 +0300, Denys Fedoryshchenko wrote: > > Hi! > > > > Tried to run squid on latest kernel, and hit a panic > > Sometimes it just shows warning in dmesg (but doesnt work properly) > > [ 75.701666] IPv4: Attempt to release TCP socket in state 10 > > 88102d430780 > > [ 83.866974] squid (2700) used greatest stack depth: 12912 bytes left > > [ 87.506644] IPv4: Attempt to release TCP socket in state 10 > > 880078a48780 > > [ 114.704295] IPv4: Attempt to release TCP socket in state 10 > > 881029f8ad00 > > > > I cannot catch yet oops/panic message, netconsole not working. > > > > After triggering warning message 3 times, i am unable to run squid > > anymore (without reboot), and in netstat it doesnt show port running. > > > > firewall is: > > *mangle > > -A PREROUTING -p tcp -m socket -j DIVERT > > -A PREROUTING -p tcp -m tcp --dport 80 -i eno1 -j TPROXY --on-port 3129 > > --on-ip 0.0.0.0 --tproxy-mark 0x1/0x1 > > -A DIVERT -j MARK --set-xmark 0x1/0x > > -A DIVERT -j ACCEPT > > > > routing > > ip rule add fwmark 1 lookup 100 > > ip route add local default dev eno1 table 100 > > > > > > squid config is default with tproxy option > > http_port 3129 tproxy > > > > Hmppff... sorry for this, I will send a fix. > > Thanks for the report ! > Could you try the following ? Thanks ! 
net/netfilter/xt_TPROXY.c |4 1 file changed, 4 insertions(+) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 7f4414d26a66..663c4c3c9072 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, daddr, dport, in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound @@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, daddr, ntohs(dport), in->ifindex); + if (sk && !atomic_inc_not_zero(>sk_refcnt)) + sk = NULL; /* NOTE: we return listeners even if bound to * 0.0.0.0, those are filtered out in * xt_socket, since xt_TPROXY needs 0 bound
Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands
On 08/17/2016 05:06 PM, Tejun Heo wrote: > What I'd suggest is keeping two sets of pointers, one set for the > programs explicitly attached to the cgroup and the other for programs > which are to be executed for the cgroup. The two pointers in the > latter set will point to the closest non-null program in its ancestry. > Note that the pointers may point to programs belonging to different > ancestors. > > A - B - C > \ D - E > > Let's say B has both ingress and egress programs and D only has > egress. E's execution pointers should point to D's egress program and > B's ingress program. > > The pointers would need to be updated > > 1. When a program is installed or removed. For simplicity's sake, we >can just walk the whole hierarchy. If that's too expensive >(shouldn't be), we can find out the closest ancestor where both >programs can be determined and then walk from there. > > 2. When a new cgroup is created, it should inherit its execution table >from its parent. Ok, yes, that sounds sane to me. Will implement that scheme. Thanks! Daniel