Re: pls merge net to net-next

2016-08-17 Thread David Miller
From: Stephen Hemminger 
Date: Tue, 16 Aug 2016 14:02:18 -0700

> Since Vitaly's changes to hyperv drivers went into net, it makes
> it harder to submit other changes for net-next since they will invariably
> cause conflicts.

Done.


[PATCH 2/2] pptp: Reset call_id as 0 to avoid one useless lookup at next time

2016-08-17 Thread fgao
From: Gao Feng 

When pptp fails to get a valid callid, the global call_id is left at
MAX_CALLID. Every subsequent allocation is then guaranteed to fail as
well, because find_next_zero_bit is invoked starting from call_id + 1,
which already exceeds the MAX_CALLID limit.
So reset call_id to 0 when no valid callid can be found, and add one
variable to track whether the second lookup is needed.
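
For illustration, here is a minimal user-space sketch of the failure
mode. It relies only on the find_next_zero_bit() convention of
returning "size" whenever "offset >= size"; the bitmap is stubbed out,
so this is an illustration rather than the kernel code:

/* Sketch of the stuck allocator: once call_id == MAX_CALLID, every
 * later search starts at MAX_CALLID + 1 and can only fail. */
#include <stdio.h>

#define MAX_CALLID (65535 - 1)

/* toy stand-in for the kernel helper: returns 'size' when the offset
 * is already past the end (or when no zero bit is found) */
static unsigned long find_next_zero_bit_stub(unsigned long size,
                                             unsigned long offset)
{
        return offset >= size ? size : offset; /* pretend bit is free */
}

int main(void)
{
        unsigned long call_id = MAX_CALLID; /* state after one failure */

        call_id = find_next_zero_bit_stub(MAX_CALLID, call_id + 1);
        if (call_id == MAX_CALLID)
                printf("every later allocation fails without the reset\n");
        return 0;
}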

Signed-off-by: Gao Feng 
---
 v1: Initial patch

 drivers/net/ppp/pptp.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 48c3701..9afef3c 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -103,12 +103,21 @@ static int add_chan(struct pppox_sock *sock,
static int call_id;
 
spin_lock(&chan_lock);
-   if (!sa->call_id)   {
+   if (!sa->call_id) {
+   bool from_start = (call_id == 0);
+
call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1);
if (call_id == MAX_CALLID) {
-   call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1);
-   if (call_id == MAX_CALLID)
+   if (unlikely(from_start)) {
+   call_id = 0;
goto out_err;
+   } else {
+   call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1);
+   if (call_id == MAX_CALLID) {
+   call_id = 0;
+   goto out_err;
+   }
+   }
}
sa->call_id = call_id;
} else if (test_bit(sa->call_id, callid_bitmap)) {
@@ -656,8 +665,10 @@ static int __init pptp_init_module(void)
pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n");
 
callid_sock = vzalloc((MAX_CALLID + 1) * sizeof(void *));
-   if (!callid_sock)
+   if (!callid_sock) {
+   pr_err("PPTP: can't alloc callid_sock mem");
return -ENOMEM;
+   }
 
err = gre_add_protocol(&gre_pptp_protocol, GREPROTO_PPTP);
if (err) {
-- 
1.9.1




[PATCH v1 1/2] pptp: Use macro and sizeof instead of literal number

2016-08-17 Thread fgao
From: Gao Feng 

Use existing macros like PPP_ADDRESS and SC_COMP_PROT, and sizeof on
fixed variables, instead of the original literal numbers, to enhance
readability.

BTW, the original pptp_rcv uses the literal number "12" as the param
of pskb_may_pull, but "12" is less than the size of struct
pptp_gre_header. Using sizeof(*header) fixes this issue.
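
For reference, the header in question is laid out in pptp.c roughly as
follows (16 bytes in total with __packed, versus the 12 previously
pulled; seq and ack are optional on the wire, which is presumably
where the "12" came from):

struct pptp_gre_header {
	u8      flags;          /* bitfield */
	u8      ver;            /* should be PPTP_GRE_VER (enhanced GRE) */
	__be16  protocol;       /* should be PPTP_GRE_PROTO (PPP) */
	__be16  payload_len;    /* size of PPP payload, not inc. GRE header */
	__be16  call_id;        /* peer's call_id for this session */
	__be32  seq;            /* sequence number, present if S == 1 */
	__be32  ack;            /* seq number of highest packet received */
} __packed;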

Signed-off-by: Gao Feng 
---
 v1: Initial patch

 drivers/net/ppp/pptp.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 1951b10..48c3701 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -54,6 +54,8 @@ static struct proto pptp_sk_proto __read_mostly;
 static const struct ppp_channel_ops pptp_chan_ops;
 static const struct proto_ops pptp_ops;
 
+static const u8 fixed_ppphdr[2] = {PPP_ALLSTATIONS, PPP_UI};
+
 static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr)
 {
struct pppox_sock *sock;
@@ -167,7 +169,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 
tdev = rt->dst.dev;
 
-   max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2;
+   max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + sizeof(fixed_ppphdr);
 
if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
@@ -190,9 +192,9 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 
/* Put in the address/control bytes if necessary */
if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) {
-   data = skb_push(skb, 2);
-   data[0] = PPP_ALLSTATIONS;
-   data[1] = PPP_UI;
+   data = skb_push(skb, sizeof(fixed_ppphdr));
+   data[0] = fixed_ppphdr[0];
+   data[1] = fixed_ppphdr[1];
}
 
len = skb->len;
@@ -219,8 +221,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
}
hdr->payload_len = htons(len);
 
-   /*  Push down and install the IP header. */
-
+   /* Push down and install the IP header. */
skb_reset_transport_header(skb);
skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
@@ -319,14 +320,14 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb)
 allow_packet:
skb_pull(skb, headersize);
 
-   if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) {
+   if (PPP_ADDRESS(payload) == PPP_ALLSTATIONS && PPP_CONTROL(payload) == PPP_UI) {
/* chop off address/control */
if (skb->len < 3)
goto drop;
-   skb_pull(skb, 2);
+   skb_pull(skb, sizeof(fixed_ppphdr));
}
 
-   if ((*skb->data) & 1) {
+   if ((*skb->data) & SC_COMP_PROT) {
/* protocol is compressed */
skb_push(skb, 1)[0] = 0;
}
@@ -351,7 +352,7 @@ static int pptp_rcv(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
goto drop;
 
-   if (!pskb_may_pull(skb, 12))
+   if (!pskb_may_pull(skb, sizeof(*header)))
goto drop;
 
iph = ip_hdr(skb);
@@ -468,7 +469,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
ip_rt_put(rt);
po->chan.mtu -= PPTP_HEADER_OVERHEAD;
 
-   po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header);
+   po->chan.hdrlen = sizeof(fixed_ppphdr) + sizeof(struct pptp_gre_header);
error = ppp_register_channel(&po->chan);
if (error) {
pr_err("PPTP: failed to register PPP channel (%d)\n", error);
-- 
1.9.1



[PATCH] fib_trie: Fix the description of pos and bits

2016-08-17 Thread Xunlei Pang
1) Fix one typo: s/tn/tp/
2) Fix the description about the "u" bits.
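
For context, the function whose comment block is being fixed extracts
a child index from the key bits at kv->pos; its body in
net/ipv4/fib_trie.c reads roughly as follows (shown for reference):

static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
	unsigned long index = key ^ kv->key;

	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
		return 0;

	return index >> kv->pos;
}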

Signed-off-by: Xunlei Pang 
---
 net/ipv4/fib_trie.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d07fc07..eb7c5d1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -249,7 +249,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
  * index into the parent's child array. That is, they will be used to find
  * 'n' among tp's children.
  *
- * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
  * for the node n.
  *
  * All the bits we have seen so far are significant to the node n. The rest
@@ -258,7 +258,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
  * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  * n's child array, and will of course be different for each child.
  *
- * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * The rest of the bits, from 0 to (n->pos - 1) - "u" - are completely unknown
  * at this point.
  */
 
-- 
1.8.3.1



Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Florian Fainelli wrote:

The larger issue is that the emac_sgmii node in the form you posted
is going to be backed by a platform device in Linux while you want a
PHY device with a reg property that describes an MDIO address
(#address-cells = 1, #size-cells = 0).


But how do I get the platform device for the emac_sgmii node?  If I 
create an of_device_id entry for it, like this:


static const struct of_device_id emac_dt_match[] = {
{
.compatible = "qcom,fsm9900-emac",
},
{}
};

static const struct of_device_id emac_sgmii_dt_match[] = {
{
.compatible = "qcom,fsm9900-emac-sgmii",
},
{}
};

Then the probe function will be called for qcom,fsm9900-emac-sgmii 
separately from qcom,fsm9900-emac, which just confuses things. So I 
can't create emac_sgmii_dt_match.


I know this is standard DT stuff, and I used to do a lot of work on DT 
so maybe I should know this already.  But it seems to me that I need to 
manually create the platform_device for qcom,fsm9900-emac-sgmii.



IIRC the amd xgbe driver mainline had a similar design but still
implemented a PHY device anyway although it may not have been using
Device Tree. It should still be possible to implement a PHY driver
that you manually register and bind to its device_node pointer such
that of_phy_find_device and friends still work. You would do this
from the emac_sgmii platform device driver and parent devices in a
way that satisfies the PHY device driver lifecycle as well.

Hope this helps.


It doesn't, sorry.  The emac_sgmii is really just another register block 
for the driver to program.  Creating another PHY driver for it doesn't 
really make sense.  It's not on an MDIO bus.


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Florian Fainelli
On August 17, 2016 8:27:19 PM PDT, Timur Tabi  wrote:
>Florian Fainelli wrote:
>
>>>  emac_sgmii: ethernet-phy@410400 {
>>>  compatible = "qcom,qdf2432-emac-phy";
>>>  reg = <0x0 0x00410400 0x0 0x100>;
>>>  interrupts = <0 0x104 0>;
>>>  };
>>>
>>
>> Is this register range relative to the emac0 node here, or is this
>> really a separate node, within the same address space as your emac0 node?
>
>It's a separate node within the same address space.  We can't guarantee
>that it's adjacent to any other EMAC register -- it could be anywhere in
>memory. 0x00410400 is the real physical address of those registers.

Ok

>
>> Answer largely depends on whether your device is really located outside
>> of the emac. If it is located outside, then a platform device matching the
>> compatible string would get you what you want. If the emac_sgmii block
>> is a sub-block within the EMAC, then a few things need fixing:
>>
>> - your emac_sgmii node should be a sub-node of the emac node, not a sibling
>> - the emac0 node should have a "ranges" property that indicates how to
>> translate the sub-nodes' "reg" property based on the base register
>> address of the emac0 block
>> - you would have to call of_platform_populate from the EMAC driver to
>> ensure that the emac_sgmii child node and therefore platform device gets
>> created
>
>Even if the emac_sgmii block were a sub-block, wouldn't it conflict with
>the ethernet-phy@4 node?  The #address-cells and #size-cells properties
>cannot be valid for both the emac_sgmii and ethernet-phy@4 nodes.

These two properties apply to subnodes within the emac0 and emac_sgmii nodes;
this is standard DT stuff here.

The larger issue is that the emac_sgmii node in the form you posted is going to 
be backed by a platform device in Linux while you want a PHY device with a reg 
property that describes an MDIO address (#address-cells = 1, #size-cells = 0).

IIRC the amd xgbe driver mainline had a similar design but still implemented a 
PHY device anyway although it may not have been using Device Tree. It should 
still be possible to implement a PHY driver that you manually register and bind 
to its device_node pointer such that of_phy_find_device and friends still work. 
You would do this from the emac_sgmii platform device driver and parent devices 
in a way that satisfies the PHY device driver lifecycle as well.
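
A rough sketch of the manual-registration idea, for concreteness (the
emac_sgmii_register_phy() name and the driver-private mii_bus are
hypothetical; the phylib calls are those available around v4.8):

#include <linux/phy.h>
#include <linux/platform_device.h>

/* Hypothetical sketch only: create a PHY device by hand and bind it to
 * the emac_sgmii DT node so of_phy_find_device() can resolve it. */
static int emac_sgmii_register_phy(struct platform_device *pdev,
				   struct mii_bus *bus)
{
	struct phy_device *phydev;
	int err;

	/* addr 0 on the private bus; phy_id 0, not clause 45, no c45 ids */
	phydev = phy_device_create(bus, 0, 0, false, NULL);
	if (IS_ERR(phydev))
		return PTR_ERR(phydev);

	/* bind the DT node so of_phy_find_device() and friends work */
	phydev->mdio.dev.of_node = pdev->dev.of_node;

	err = phy_device_register(phydev);
	if (err)
		phy_device_free(phydev);
	return err;
}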

Hope this helps.

NB: another way to do this is to give the SGMII PHY device an MDIO address,
provided that it has one, and pass the memory mapped register range as a
syscon property pointing to the emac_sgmii register range. This is not really
clean, though.


-- 
Florian


Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Florian Fainelli wrote:


 emac_sgmii: ethernet-phy@410400 {
 compatible = "qcom,qdf2432-emac-phy";
 reg = <0x0 0x00410400 0x0 0x100>;
 interrupts = <0 0x104 0>;
 };



Is this register range relative to the emac0 node here, or is this
really a separate node, within the same address space as your emac0 node?


It's a separate node within the same address space.  We can't guarantee 
that it's adjacent to any other EMAC register -- it could be anywhere in 
memory. 0x00410400 is the real physical address of those registers.



Answer largely depends on whether your device is really located outside
of the emac. If it is located outside, then a platform device matching the
compatible string would get you what you want. If the emac_sgmii block
is a sub-block within the EMAC, then a few things need fixing:

- your emac_sgmii node should be a sub-node of the emac node, not a sibling
- the emac0 node should have a "ranges" property that indicates how to
translate the sub-nodes' "reg" property based on the base register
address of the emac0 block
- you would have to call of_platform_populate from the EMAC driver to
ensure that the emac_sgmii child node and therefore platform device gets
created


Even if the emac_sgmii block were a sub-block, wouldn't it conflict with 
the ethernet-phy@4 node?  The #address-cells and #size-cells properties 
cannot be valid for both the emac_sgmii and ethernet-phy@4 nodes.


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO

2016-08-17 Thread David Ahern
On 8/17/16 7:06 PM, Alexander Duyck wrote:
> On Wed, Aug 17, 2016 at 4:23 PM, David Ahern  wrote:
>> On 8/17/16 5:16 PM, Alexander Duyck wrote:
 diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
 index 1ecbd7715f6d..6d78f162a88b 100644
 --- a/net/openvswitch/actions.c
 +++ b/net/openvswitch/actions.c
 @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 skb->mac_len);
 skb_reset_mac_header(skb);

 +   /* for GSO: set MPLS as network header and encapsulated protocol
 +* header as inner network header
 +*/
 +   skb_set_network_header(skb, skb->mac_len);
 +   skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN);
 +
 new_mpls_lse = (__be32 *)skb_mpls_header(skb);
 *new_mpls_lse = mpls->mpls_lse;

>>>
>>> So the one question I would have about this is how attached are you to
>>> using the network_header to record the offset for the MPLS header?  I
>>> ask because I think from a hardware offloading perspective it would
>>> make it much easier if instead you used the inner_mac_header to
>>> represent the offset for the MPLS header.  This way device drivers
>>> could just skip over it like a VLAN and just use network and transport
>>> header values like they would otherwise.
>>>
>>
>> Where does the network_header relate to if I change the marker to 
>> inner_mac_header? Would it be skipped?
> 
> No, the network header would still be the network header.

If core MPLS code (i.e., non-OVS) does not do skb_reset_network_header(skb)
after adding the MPLS label, nothing works -- not even ping with small
packets; tcpdump shows a completely mangled packet. Right now, resetting the
network_header to the MPLS header is required.



[PATCH v1 1/1] pppoe: l2tp: the PPPOX_CONNECTED should be used with bit operation

2016-08-17 Thread fgao
From: Gao Feng 

There is some code in pppoe and l2tp that uses PPPOX_CONNECTED as a plain
value, in both assignments and condition checks. It should treat the flag
with bit operations, consistent with the rest of the code.

Signed-off-by: Gao Feng 
---
 v1: Initial Patch

 drivers/net/ppp/pppoe.c | 2 +-
 net/l2tp/l2tp_ppp.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 4ddae81..684b773 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -697,7 +697,7 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
goto err_put;
}
 
-   sk->sk_state = PPPOX_CONNECTED;
+   sk->sk_state |= PPPOX_CONNECTED;
}
 
po->num = sp->sa_addr.pppoe.sid;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index d9560aa..3984385 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -774,7 +774,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 out_no_ppp:
/* This is how we get the session context from the socket. */
sk->sk_user_data = session;
-   sk->sk_state = PPPOX_CONNECTED;
+   sk->sk_state |= PPPOX_CONNECTED;
l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
  session->name);
 
@@ -856,7 +856,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
error = -ENOTCONN;
if (sk == NULL)
goto end;
-   if (sk->sk_state != PPPOX_CONNECTED)
+   if (!(sk->sk_state & PPPOX_CONNECTED))
goto end;
 
error = -EBADF;
-- 
1.9.1




Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() to not drop all packets

2016-08-17 Thread Soheil Hassas Yeganeh
On Wed, Aug 17, 2016 at 5:17 PM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> Over the years, TCP BDP has increased a lot, and is typically
> in the order of ~10 Mbytes with help of clever Congestion Control
> modules.
>
> In the presence of packet losses, TCP stores incoming packets into an out
> of order queue, and the number of skbs sitting there waiting for the
> missing packets to be received can match the BDP (~10 Mbytes).
>
> In some cases, TCP needs to make room for incoming skbs, and current
> strategy can simply remove all skbs in the out of order queue as a last
> resort, incurring a huge penalty, both for receiver and sender.
>
> Unfortunately these 'last resort events' are quite frequent, forcing
> sender to send all packets again, stalling the flow and wasting a lot of
> resources.
>
> This patch cleans only a part of the out of order queue in order
> to meet the memory constraints.
>
> Signed-off-by: Eric Dumazet 
> Cc: Neal Cardwell 
> Cc: Yuchung Cheng 
> Cc: Soheil Hassas Yeganeh 
> Cc: C. Stephen Gun 
> Cc: Van Jacobson 

Acked-by: Soheil Hassas Yeganeh 

> ---
>  net/ipv4/tcp_input.c |   47 -
>  1 file changed, 28 insertions(+), 19 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
> if (tcp_prune_queue(sk) < 0)
> return -1;
>
> -   if (!sk_rmem_schedule(sk, skb, size)) {
> +   while (!sk_rmem_schedule(sk, skb, size)) {
> if (!tcp_prune_ofo_queue(sk))
> return -1;
> -
> -   if (!sk_rmem_schedule(sk, skb, size))
> -   return -1;
> }
> }
> return 0;
> @@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>  }
>
>  /*
> - * Purge the out-of-order queue.
> - * Return true if queue was pruned.
> + * Clean the out-of-order queue to make room.
> + * We drop high sequences packets to :
> + * 1) Let a chance for holes to be filled.
> + * 2) not add too big latencies if thousands of packets sit there.
> + *(But if application shrinks SO_RCVBUF, we could still end up
> + * freeing whole queue here)
> + *
> + * Return true if queue has shrunk.
>   */
>  static bool tcp_prune_ofo_queue(struct sock *sk)
>  {
> struct tcp_sock *tp = tcp_sk(sk);
> -   bool res = false;
> +   struct sk_buff *skb;
>
> -   if (!skb_queue_empty(&tp->out_of_order_queue)) {
> -   NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> -   __skb_queue_purge(&tp->out_of_order_queue);
> +   if (skb_queue_empty(&tp->out_of_order_queue))
> +   return false;
>
> -   /* Reset SACK state.  A conforming SACK implementation will
> -* do the same at a timeout based retransmit.  When a connection
> -* is in a sad state like this, we care only about integrity
> -* of the connection not performance.
> -*/
> -   if (tp->rx_opt.sack_ok)
> -   tcp_sack_reset(&tp->rx_opt);
> +   NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +
> +   while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
> +   tcp_drop(sk, skb);
> sk_mem_reclaim(sk);
> -   res = true;
> +   if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> +   !tcp_under_memory_pressure(sk))
> +   break;
> }
> -   return res;
> +
> +   /* Reset SACK state.  A conforming SACK implementation will
> +* do the same at a timeout based retransmit.  When a connection
> +* is in a sad state like this, we care only about integrity
> +* of the connection not performance.
> +*/
> +   if (tp->rx_opt.sack_ok)
> +   tcp_sack_reset(&tp->rx_opt);
> +   return true;
>  }
>
>  /* Reduce allocated memory if we can, trying to get
>
>

Very nice patch, Eric! Thanks.


Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO

2016-08-17 Thread Alexander Duyck
On Wed, Aug 17, 2016 at 4:23 PM, David Ahern  wrote:
> On 8/17/16 5:16 PM, Alexander Duyck wrote:
>>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>>> index 1ecbd7715f6d..6d78f162a88b 100644
>>> --- a/net/openvswitch/actions.c
>>> +++ b/net/openvswitch/actions.c
>>> @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
>>> skb->mac_len);
>>> skb_reset_mac_header(skb);
>>>
>>> +   /* for GSO: set MPLS as network header and encapsulated protocol
>>> +* header as inner network header
>>> +*/
>>> +   skb_set_network_header(skb, skb->mac_len);
>>> +   skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN);
>>> +
>>> new_mpls_lse = (__be32 *)skb_mpls_header(skb);
>>> *new_mpls_lse = mpls->mpls_lse;
>>>
>>
>> So the one question I would have about this is how attached are you to
>> using the network_header to record the offset for the MPLS header?  I
>> ask because I think from a hardware offloading perspective it would
>> make it much easier if instead you used the inner_mac_header to
>> represent the offset for the MPLS header.  This way device drivers
>> could just skip over it like a VLAN and just use network and transport
>> header values like they would otherwise.
>>
>
> Where does the network_header relate to if I change the marker to 
> inner_mac_header? Would it be skipped?

No, the network header would still be the network header.

> skb->protocol is set to MPLS.
> mac_header points to ethernet address
> network_header points to ???

The network_header would point to the IP header like it would be for a
non-MPLS frame.

> inner protocol is set to what is encapsulated (e.g., ipv4 or ipv6)

I am okay with this, but wonder if we actually need it.  Do you know
of any protocols other than IPv4 or IPv6 that can be carried over MPLS
and would expect to be offloaded?  If not we may be able to just get
away with recording the network header offset and then using the first
nibble of the network header to determine the IP version since the
value should be 4 or 6 for the two types we are offloading.
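
A sketch of that suggestion (hypothetical helper, not from any posted
patch; assumes network_header points at the encapsulated IP header as
proposed above):

#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Hypothetical helper: infer the encapsulated protocol from the first
 * nibble of the inner header rather than relying on skb->inner_protocol. */
static __be16 mpls_payload_proto(const struct sk_buff *skb)
{
	switch (*skb_network_header(skb) >> 4) {
	case 4:
		return htons(ETH_P_IP);
	case 6:
		return htons(ETH_P_IPV6);
	default:
		return 0;	/* not an offloadable payload */
	}
}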

> inner_mac_header points to start of mpls label.

So this is what I would expect.

> inner_network points to start of network header.

The problem is that using inner_network_header to point to the network
header will require me to fork the path pretty significantly for most
of the Intel devices that would want to do MPLS GSO.  The assumption
most drivers make is that if we are offloading things then
network_header and inner_network_header will point to either IPv4 or
IPv6 headers.  Introducing MPLS as the network_header with IPv4 or
IPv6 as the inner_network_header throws a kink in the works because we
currently ignore inner_network_header for the devices that are doing
UDP or GRE tunnel GSO via GSO_PARTIAL with TSO_MANGLEID.

> Is that sufficient for h/w drivers?

I think of this as working like how we handle it for IP over IP
tunnels.  In that case we are at L3 so the inner_network_header field
is populated, but the transport header stays the same.  In the case of
MPLS it isn't really L3 it is more of an L2.5 so my preference would
be to treat it like it is an L2 tunnel or VLAN and just overwrite the
inner_mac_header with the MPLS header offset, and leave the network
and transport headers untouched.

One other bonus that also occurred to me is that you might be able to
get away with doing MPLS offloads for MPLS over IP or GRE tunnels.  I
hadn't realized that MPLS inside of these tunnels was a thing, I had
just noticed it while looking over how the IP-in-IP tunnels are all
being handled.  However if you move the header tracking to
inner_mac_header, and can avoid using skb->inner_protocol by instead
using the first nibble of the network_header value then you could
probably support segmenting those types of tunnels in hardware.

- Alex


Re: [PATCH v2 1/1] ppp: Fix one deadlock issue of PPP when send frame

2016-08-17 Thread Feng Gao
inline answer.

On Thu, Aug 18, 2016 at 1:42 AM, Guillaume Nault  wrote:
> On Tue, Aug 16, 2016 at 07:33:38PM +0800, f...@ikuai8.com wrote:
>> From: Gao Feng 
>>
>> The PPP channel holds one spinlock before sending a frame. But the skb
>> may be steered back to the same PPP channel by a wrong route policy. As
>> a result, the skb re-enters the same channel path and tries to take the
>> spinlock that is already held. Bang, the deadlock comes out.
>>
> Unless I misunderstood the problem you're trying to solve, this patch
> doesn't really help: deadlock still occurs if the same IP is used for
> L2TP and PPP's peer address.
>

The deadlock happens because the same cpu tries to take a spinlock that
it already holds. Now PPP_CHANNEL_LOCK_BH records the lock owner after
taking the lock; when the same cpu invokes PPP_CHANNEL_LOCK_BH again,
cl.owner equals the current cpu id, so it only increments lock_cnt
without trying to take the lock again.
So it avoids the deadlock.
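
For readers following along, the mechanism looks roughly like this (a
reconstruction of the approach described, not the exact patch code):

/* Recursion-aware channel lock, modeled on xmit_lock_owner. */
struct channel_lock {
	spinlock_t lock;
	int owner;	/* CPU id of the lock holder, -1 when unlocked */
	int lock_cnt;	/* recursion depth on the owning CPU */
};

#define PPP_CHANNEL_LOCK_BH(cl)				\
	do {						\
		local_bh_disable();			\
		if ((cl).owner != smp_processor_id()) {	\
			spin_lock(&(cl).lock);		\
			(cl).owner = smp_processor_id(); \
		}					\
		(cl).lock_cnt++;			\
	} while (0)

#define PPP_CHANNEL_UNLOCK_BH(cl)			\
	do {						\
		if (--(cl).lock_cnt == 0) {		\
			(cl).owner = -1;		\
			spin_unlock(&(cl).lock);	\
		}					\
		local_bh_enable();			\
	} while (0)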

>> Now add one lock owner to avoid it, like the xmit_lock_owner of
>> netdev_queue. Check the lock owner before trying to take the spinlock;
>> if the current cpu is already the owner, there is no need to lock
>> again. When the PPP channel takes the spinlock for the first time, it
>> sets the owner to the current CPU ID.
>>
> I think you should forbid lock recursion entirely, and drop the packet
> if the owner tries to re-acquire the channel lock. Otherwise you just
> move the deadlock down the stack (l2tp_xmit_skb() can't be called
> recursively).

The reason for fixing it in ppp is that there are other layers on top of
the ppp module. Resolving it in the ppp module avoids all similar
potential issues.

>
>> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
>> index 70cfa06..6909ab1 100644
>> --- a/drivers/net/ppp/ppp_generic.c
>> +++ b/drivers/net/ppp/ppp_generic.c
>> @@ -162,6 +162,37 @@ struct ppp {
>>|SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
>>|SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
>>
>> +struct chennel_lock {
> ^
> I guess you meant 'channel_lock'.

Yes. It is a typo. Thanks.

>
>> + spinlock_t lock;
>> + u32 owner;
> This field's default value is -1. So why not declare it as 'int'?

OK. I will fix it.

Best Regards
Feng
--
To unsubscribe from this list: send the line "unsubscribe linux-ppp" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT] Networking

2016-08-17 Thread David Miller

1) Buffered powersave frame test is reversed in cfg80211, fix from
   Felix Fietkau.

2) Remove bogus WARN_ON in openvswitch, from Jarno Rajahalme.

3) Fix some tg3 ethtool logic bugs, and one that would cause no
   interrupts to be generated when rx-coalescing is set to 0.
   From Satish Baddipadige and Siva Reddy Kallam.

4) QLCNIC mailbox corruption and napi budget handling fix from
   Manish Chopra.

5) Fix fib_trie logic when walking the trie during /proc/net/route
   output that can access a stale node pointer.  From David Forster.

6) Several sctp_diag fixes from Phil Sutter.

7) PAUSE frame handling fixes in mlxsw driver from Ido Schimmel.

8) Checksum fixup fixes in bpf from Daniel Borkmann.

9) Memory leaks in nfnetlink, from Liping Zhang.

10) Use after free in rxrpc, from David Howells.

11) Use after free in new skb_array code of macvtap driver, from
Jason Wang.

12) Calipso resource leak, from Colin Ian King.

13) mediatek bug fixes (missing stats sync init, etc.) from Sean Wang.

14) Fix bpf non-linear packet write helpers, from Daniel Borkmann.

15) Fix lockdep splats in macsec, from Sabrina Dubroca.

16) hv_netvsc bug fixes from Vitaly Kuznetsov, mostly to do with
VF handling.

17) Various tc-action bug fixes, from CONG Wang.

Please pull, thanks a lot!

The following changes since commit bf0f500bd0199aab613eb0ecb3412edd5472740d:

  Merge tag 'trace-v4.8-1' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace (2016-08-03 12:50:06 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to b96c22c071eb1126db4055de4bb75b02b05affd1:

  Merge branch 'tc_action-fixes' (2016-08-17 19:27:58 -0400)


Alexander Duyck (2):
  ixgbe: Force VLNCTRL.VFE to be set in all VMDq paths
  ixgbe: Re-enable ability to toggle VLAN filtering

Alexei Starovoitov (2):
  bpf: restore behavior of bpf_map_update_elem
  samples/bpf: add bpf_map_update_elem() tests

Arnd Bergmann (3):
  net: xgene: fix maybe-uninitialized variable
  rxrpc: fix uninitialized pointer dereference in debug code
  dsa: mv88e6xxx: hide unused functions

Christophe Leroy (1):
  netfilter: nf_conntrack_sip: CSeq 0 is a valid CSeq

Colin Ian King (2):
  cfg80211: fix missing break in NL8211_CHAN_WIDTH_80P80 case
  calipso: fix resource leak on calipso_genopt failure

Daniel Borkmann (5):
  bpf: also call skb_postpush_rcsum on xmit occasions
  bpf: fix checksum fixups on bpf_skb_store_bytes
  bpf: fix checksum for vlan push/pop helper
  bpf: fix bpf_skb_in_cgroup helper naming
  bpf: fix write helpers with regards to non-linear parts

Dave Ertman (1):
  i40e: check for and deal with non-contiguous TCs

David Forster (1):
  ipv4: panic in leaf_walk_rcu due to stale node pointer

David Howells (6):
  rxrpc: Fix races between skb free, ACK generation and replying
  rxrpc: Need to flag call as being released on connect failure
  rxrpc: Don't access connection from call if pointer is NULL
  rxrpc: Once packet posted in data_ready, don't retry posting
  rxrpc: Fix a use-after-push in data_ready handler
  rxrpc: Free packets discarded in data_ready

David S. Miller (14):
  Merge branch 'tg3-fixes'
  Merge branch 'qlcnic-fixes'
  Merge tag 'mac80211-for-davem-2016-08-05' of git://git.kernel.org/.../jberg/mac80211
  Merge branch 'sctp_diag-fixes'
  Merge branch 'mlxsw-dcb-fixes'
  Merge branch 'bpf-csum-complete'
  Merge branch 'qed-fixes'
  Merge tag 'rxrpc-fixes-20160809' of git://git.kernel.org/.../dhowells/linux-fs
  Merge git://git.kernel.org/.../pablo/nf
  Merge branch 'hv_netvsc-VF-removal-fixes'
  Merge branch 'mediatek-fixes'
  Merge branch 'mlxsw-fixes'
  Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue
  Merge branch 'tc_action-fixes'

Elad Raz (1):
  mlxsw: spectrum: Add missing packet traps

Fabian Frederick (1):
  net: hns: fix typo in g_gmac_stats_string[]

Felix Fietkau (2):
  mac80211: fix check for buffered powersave frames with txq
  mac80211: fix purging multicast PS buffer queue

Florian Westphal (1):
  rhashtable: avoid large lock-array allocations

Geert Uytterhoeven (1):
  net: dsa: b53: Add missing ULL suffix for 64-bit constant

Grygorii Strashko (1):
  drivers: net: cpsw: fix kmemleak false-positive reports for sk buffers

Harini Katakam (1):
  net: macb: Correct CAPS mask

Ian Wienand (1):
  OVS: Ignore negative headroom value

Ido Schimmel (11):
  mlxsw: spectrum: Do not assume PAUSE frames are disabled
  mlxsw: spectrum: Do not override PAUSE settings
  mlxsw: spectrum: Add missing DCB rollback in error path
  mlxsw: spectrum: Don't return upon error in removal path
  mlxsw: spectrum: Remove redundant errors from the code
  mlxsw: 

Re: [PATCH] net: ipconfig: Fix more use after free

2016-08-17 Thread David Miller
From: Thierry Reding 
Date: Tue, 16 Aug 2016 16:45:38 +0200

> From: Thierry Reding 
> 
> While commit 9c706a49d660 ("net: ipconfig: fix use after free") avoids
> the use after free, the resulting code still ends up calling both the
> ic_setup_if() and ic_setup_routes() after calling ic_close_devs(), and
> access to the device is still required.
> 
> Move the call to ic_close_devs() to the very end of the function.
> 
> Signed-off-by: Thierry Reding 
> ---
> This applies on top of next-20160816.

Applied, thanks.


Re: [PATCH v4 net-next 0/3] strp: Stream parser for messages

2016-08-17 Thread David Miller
From: Tom Herbert 
Date: Mon, 15 Aug 2016 14:51:00 -0700

> This patch set introduces a utility for parsing application layer
> protocol messages in a TCP stream. This is a generalization of the
> mechanism implemented for Kernel Connection Multiplexor.
 ...

Series applied, thanks Tom.


Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 16:17 -0700, John Fastabend wrote:
> On 16-08-17 04:01 PM, Eric Dumazet wrote:
> > On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote:
> > 
> >> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> >> index d10b762..f5b7254 100644
> >> --- a/net/sched/sch_generic.c
> >> +++ b/net/sched/sch_generic.c
> >> @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
> >>if (qdisc_is_percpu_stats(q)) {
> >>qdisc_qstats_cpu_backlog_inc(q, nskb);
> >>qdisc_qstats_cpu_qlen_inc(q);
> >> +  set_thread_flag(TIF_NEED_RESCHED);
> >>} else {
> >>qdisc_qstats_backlog_inc(q, nskb);
> >>q->q.qlen++;
> > 
> > Hmm... care to elaborate this bit ?
> > 
> > 
> > 
> 
> ah dang that's leftover from trying to resolve a skb getting stuck on the
> bad_txq_cell from qdisc_enqueue_skb_bad_txq(). You'll notice I added
> a __netif_schedule(skb) call in qdisc_enqueue_skb_bad_txq() which
> resolves this and the set_thread_flag() here can then just be removed.

OK I feel much better now ;)

Thanks !




Re: [PATCH] xfrm: Only add l3mdev oif to dst lookups

2016-08-17 Thread David Miller
From: David Ahern 
Date: Sun, 14 Aug 2016 19:52:56 -0700

> Subash reported that commit 42a7b32b73d6 ("xfrm: Add oif to dst lookups")
> broke a wifi use case that uses fib rules and xfrms. The intent of
> 42a7b32b73d6 was driven by VRFs with IPsec. As a compromise relax the
> use of oif in xfrm lookups to L3 master devices only (ie., oif is either
> an L3 master device or is enslaved to a master device).
> 
> Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups")
> Reported-by: Subash Abhinov Kasiviswanathan 
> Signed-off-by: David Ahern 

Steffen, please pick this up.

Thank you.


Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO

2016-08-17 Thread David Ahern
On 8/17/16 5:16 PM, Alexander Duyck wrote:
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index 1ecbd7715f6d..6d78f162a88b 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
>> skb->mac_len);
>> skb_reset_mac_header(skb);
>>
>> +   /* for GSO: set MPLS as network header and encapsulated protocol
>> +* header as inner network header
>> +*/
>> +   skb_set_network_header(skb, skb->mac_len);
>> +   skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN);
>> +
>> new_mpls_lse = (__be32 *)skb_mpls_header(skb);
>> *new_mpls_lse = mpls->mpls_lse;
>>
> 
> So the one question I would have about this is how attached are you to
> using the network_header to record the offset for the MPLS header?  I
> ask because I think from a hardware offloading perspective it would
> make it much easier if instead you used the inner_mac_header to
> represent the offset for the MPLS header.  This way device drivers
> could just skip over it like a VLAN and just use network and transport
> header values like they would otherwise.
> 

Where does the network_header relate to if I change the marker to 
inner_mac_header? Would it be skipped?

skb->protocol is set to MPLS.
mac_header points to ethernet address
network_header points to ???

inner protocol is set to what is encapsulated (e.g., ipv4 or ipv6)
inner_mac_header points to start of mpls label.
inner_network points to start of network header.

Is that sufficient for h/w drivers?


Re: [PATCH 00/16] pull request for net-next: batman-adv 2016-08-16

2016-08-17 Thread David Miller
From: Simon Wunderlich 
Date: Tue, 16 Aug 2016 10:32:28 +0200

> this is our second feature pull request for batman-adv in this round, with
> our joint work on netlink and netns support. There will be at least one more
> pull request coming later.
> 
> Please pull or let me know of any problem!

Pulled, thanks Simon.


Re: [net 0/6][pull request] Intel Wired LAN Driver Updates 2016-08-16

2016-08-17 Thread David Miller
From: Jeff Kirsher 
Date: Tue, 16 Aug 2016 13:40:08 -0700

> This series contains fixes to e1000e, igb, ixgbe and i40e.

Pulled, thanks a lot Jeff.


Re: [patch net v2 00/10] mlxsw: IPv4 UC router fixes

2016-08-17 Thread David Miller
From: Jiri Pirko 
Date: Wed, 17 Aug 2016 16:39:27 +0200

> Ido says:
> Patches 1-3 fix a long standing problem in the driver's init sequence,
> which manifests itself quite often when routing daemons try to configure
> an IP address on registered netdevs that don't yet have an associated
> vPort.
> 
> Patches 4-9 add missing packet traps for the router to work properly and
> also fix ordering issue following the recent changes to the driver's init
> sequence.
> 
> The last patch isn't related to the router, but fixes a general problem
> in which under certain conditions packets aren't trapped to CPU.
> 
> v1->v2:
> - Change order of patch 7
> - Add patch 6 following Ilan's comment
> - Add patchset name and cover letter

Series applied, thanks.


Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

2016-08-17 Thread John Fastabend
On 16-08-17 04:04 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:38 -0700, John Fastabend wrote:
>> The sch_mq qdisc creates a sub-qdisc per tx queue which are then
>> called independently for enqueue and dequeue operations. However
>> statistics are aggregated and pushed up to the "master" qdisc.
>>
>> This patch adds support for any of the sub-qdiscs to be per cpu
>> statistic qdiscs. To handle this case add a check when calculating
>> stats and aggregate the per cpu stats if needed.
>>
>> Also exports __gnet_stats_copy_queue() to use as a helper function.
> 
> 
> Looks like this patch should be happening earlier in the series ?
> 
> 

hmm yep, patches 12 and 13 should come before 11 to avoid introducing
a bug and subsequently fixing it.


Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule

2016-08-17 Thread John Fastabend
On 16-08-17 04:01 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote:
> 
>> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
>> index d10b762..f5b7254 100644
>> --- a/net/sched/sch_generic.c
>> +++ b/net/sched/sch_generic.c
>> @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
>>  if (qdisc_is_percpu_stats(q)) {
>>  qdisc_qstats_cpu_backlog_inc(q, nskb);
>>  qdisc_qstats_cpu_qlen_inc(q);
>> +set_thread_flag(TIF_NEED_RESCHED);
>>  } else {
>>  qdisc_qstats_backlog_inc(q, nskb);
>>  q->q.qlen++;
> 
> Hmm... care to elaborate this bit ?
> 
> 
> 

ah dang that's leftover from trying to resolve a skb getting stuck on the
bad_txq_cell from qdisc_enqueue_skb_bad_txq(). You'll notice I added
a __netif_schedule(skb) call in qdisc_enqueue_skb_bad_txq() which
resolves this and the set_thread_flag() here can then just be removed.

.John


Re: [PATCH net-next 2/3] net: mpls: Fixups for GSO

2016-08-17 Thread Alexander Duyck
On Wed, Aug 17, 2016 at 2:49 PM, David Ahern  wrote:
> As reported by Lennert the MPLS GSO code is failing to properly segment
> large packets. There are a couple of problems:
>
> 1. the inner protocol is not set so the gso segment functions for inner
>protocol layers are not getting run, and
>
> 2. MPLS labels for packets that use the "native" (non-OVS) MPLS code
>are not properly accounted for in mpls_gso_segment.
>
> The MPLS GSO code was added for OVS. It is re-using skb_mac_gso_segment
> to call the gso segment functions for the higher layer protocols. That
> means skb_mac_gso_segment is called twice -- once with the network
> protocol set to MPLS and again with the network protocol set to the
> inner protocol.
>
> This patch sets the inner skb protocol addressing item 1 above and sets
> the network_header and inner_network_header to mark where the MPLS labels
> start and end. The MPLS code in OVS is also updated to set the two
> network markers.
>
> From there the MPLS GSO code uses the difference between the network
> header and the inner network header to know the size of the MPLS header
> that was pushed. It then pulls the MPLS header, resets the mac_len and
> protocol for the inner protocol and then calls skb_mac_gso_segment
> to segment the skb. Afterwards the skb protocol is set to mpls for
> each segment as suggested by Simon.
>
> Reported-by: Lennert Buytenhek 
> Signed-off-by: David Ahern 
> ---
>  net/mpls/mpls_gso.c   | 24 +---
>  net/mpls/mpls_iptunnel.c  |  5 +
>  net/openvswitch/actions.c |  6 ++
>  3 files changed, 24 insertions(+), 11 deletions(-)
>



> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 1ecbd7715f6d..6d78f162a88b 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
> skb->mac_len);
> skb_reset_mac_header(skb);
>
> +   /* for GSO: set MPLS as network header and encapsulated protocol
> +* header as inner network header
> +*/
> +   skb_set_network_header(skb, skb->mac_len);
> +   skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN);
> +
> new_mpls_lse = (__be32 *)skb_mpls_header(skb);
> *new_mpls_lse = mpls->mpls_lse;
>

So the one question I would have about this is how attached are you to
using the network_header to record the offset for the MPLS header?  I
ask because I think from a hardware offloading perspective it would
make it much easier if instead you used the inner_mac_header to
represent the offset for the MPLS header.  This way device drivers
could just skip over it like a VLAN and just use network and transport
header values like they would otherwise.

- Alex


Re: [PATCH net-next] dsa: mv88e6xxx: Timeout based on iterations

2016-08-17 Thread Vivien Didelot
Hi Andrew,

Andrew Lunn  writes:

> The mv88e6xxx driver times out operations on the switch based on
> looping until an elapsed wall clock time is reached. However, if
> usleep_range() sleeps much longer than expected, it could time out with
> an error without actually checking to see if the device has completed
> the operation. So replace the elapsed time with a fixed upper bound on
> the number of loops.
>
> Testing on various switches has shown that switches take either 0 or
> 1 iteration, so a maximum of 16 iterations is a safe limit.
>
> Signed-off-by: Andrew Lunn 
> ---
>  drivers/net/dsa/mv88e6xxx/chip.c | 15 ++-
>  1 file changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
> index a230fcba5b64..ac8e9af4879f 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.c
> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
> @@ -309,9 +309,9 @@ static int mv88e6xxx_serdes_write(struct mv88e6xxx_chip *chip, int reg, u16 val)
>  static int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg,
> u16 mask)
>  {
> - unsigned long timeout = jiffies + HZ / 10;
> + int i;
>  
> - while (time_before(jiffies, timeout)) {
> + for (i = 0; i < 16; i++) {
>   u16 val;
>   int err;
>  

Since we remove the elapsed time here, can we use mv88e6xxx_wait in
mv88e6xxx_update? It'd be good to have a consistent wait routine
everywhere.

Thanks,

Vivien


Re: [RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote:
> The qdisc_reset operation depends on the qdisc lock at the moment
> to halt any additions to gso_skb and statistics while the list is
> free'd and the stats zeroed.

...

> Signed-off-by: John Fastabend 
> ---
>  net/sched/sch_generic.c |   45 +++--
>  1 file changed, 35 insertions(+), 10 deletions(-)
> 
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 3b9a21f..29238c4 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc)
>   kfree_skb(qdisc->skb_bad_txq);
>   qdisc->skb_bad_txq = NULL;
>  
> + if (qdisc->gso_cpu_skb) {
> + int i;
> +
> + for_each_possible_cpu(i) {
> + struct gso_cell *cell;
> +
> + cell = per_cpu_ptr(qdisc->gso_cpu_skb, i);
> + if (cell) {
> + kfree_skb_list(cell->skb);
> + cell = NULL;

You probably wanted :
cell->skb = NULL;


> + }
> + }
> + }
> +
>   if (qdisc->gso_skb) {
>   kfree_skb_list(qdisc->gso_skb);
>   qdisc->gso_skb = NULL;
> @@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
>   root_lock = qdisc_lock(oqdisc);
>   spin_lock_bh(root_lock);
>  
> - /* Prune old scheduler */
> - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
> - qdisc_reset(oqdisc);
> -
>  

This probably belongs to a separate patch, before any per cpu / lockless
qdisc changes ?





Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:38 -0700, John Fastabend wrote:
> The sch_mq qdisc creates a sub-qdisc per tx queue which are then
> called independently for enqueue and dequeue operations. However
> statistics are aggregated and pushed up to the "master" qdisc.
> 
> This patch adds support for any of the sub-qdiscs to be per cpu
> statistic qdiscs. To handle this case add a check when calculating
> stats and aggregate the per cpu stats if needed.
> 
> Also exports __gnet_stats_copy_queue() to use as a helper function.


Looks like this patch should be happening earlier in the series ?




[PATCH net-next 3/4] bpf: enable event output helper also for xdp types

2016-08-17 Thread Daniel Borkmann
Follow-up to 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for
event output") for also adding the event output helper for XDP typed
programs. The event output helper has been very useful in particular for
debugging or event notification purposes, since it's much faster and
flexible than regular trace printk due to programmatically being able to
attach meta data. Same flags structure applies as with tc BPF programs.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 net/core/filter.c | 42 +-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index abf546d..3b60dfd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2408,6 +2408,41 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 };
 #endif
 
+static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+ unsigned long off, unsigned long len)
+{
+   memcpy(dst_buff, src_buff + off, len);
+   return 0;
+}
+
+static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
+   u64 meta_size)
+{
+   struct xdp_buff *xdp = (struct xdp_buff *)(long) r1;
+   struct bpf_map *map = (struct bpf_map *)(long) r2;
+   u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+   void *meta = (void *)(long) r4;
+
+   if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+   return -EINVAL;
+   if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+   return -EFAULT;
+
+   return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
+   bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+   .func   = bpf_xdp_event_output,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_CONST_MAP_PTR,
+   .arg3_type  = ARG_ANYTHING,
+   .arg4_type  = ARG_PTR_TO_STACK,
+   .arg5_type  = ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -2492,7 +2527,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 static const struct bpf_func_proto *
 xdp_func_proto(enum bpf_func_id func_id)
 {
-   return sk_filter_func_proto(func_id);
+   switch (func_id) {
+   case BPF_FUNC_perf_event_output:
+   return &bpf_xdp_event_output_proto;
+   default:
+   return sk_filter_func_proto(func_id);
+   }
 }
 
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
-- 
1.9.3



[PATCH net-next 0/4] BPF helper improvements and cleanups

2016-08-17 Thread Daniel Borkmann
This set adds various improvements to BPF helpers, a cleanup to use
skb_pkt_type_ok() helper, addition of bpf_skb_change_tail(), a follow
up for event output helper and removing ifdefs around the cgroupv2
helper bits. For details please see individual patches.

The set is based against net-next tree, but requires a merge of net
into net-next first.

Thanks a lot!

Daniel Borkmann (4):
  bpf: use skb_pkt_type_ok helper in bpf_skb_change_type
  bpf: add bpf_skb_change_tail helper
  bpf: enable event output helper also for xdp types
  bpf: get rid of cgroup helper related ifdefs

 include/linux/skbuff.h   |  43 +-
 include/net/sock.h   |  10 
 include/uapi/linux/bpf.h |  11 
 net/core/filter.c| 152 +++
 4 files changed, 204 insertions(+), 12 deletions(-)

-- 
1.9.3



[PATCH net-next 4/4] bpf: get rid of cgroup helper related ifdefs

2016-08-17 Thread Daniel Borkmann
As recently discussed during the task_under_cgroup_hierarchy() addition,
we should get rid of the ifdefs surrounding the bpf_skb_under_cgroup()
helper. If related functionality is not built-in, the helper cannot be
used anyway, which is also in line with what we do for all other helpers.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/net/sock.h | 10 ++
 net/core/filter.c  |  6 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index ff5be7e..2aab9b6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1114,6 +1114,16 @@ static inline bool sk_stream_is_writeable(const struct sock *sk)
   sk_stream_memory_free(sk);
 }
 
+static inline int sk_under_cgroup_hierarchy(struct sock *sk,
+   struct cgroup *ancestor)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+   return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
+   ancestor);
+#else
+   return -ENOTSUPP;
+#endif
+}
 
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 3b60dfd..a83766b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2374,7 +2374,6 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
 }
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
 static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
struct sk_buff *skb = (struct sk_buff *)(long)r1;
@@ -2395,7 +2394,7 @@ static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
if (unlikely(!cgrp))
return -EAGAIN;
 
-   return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+   return sk_under_cgroup_hierarchy(sk, cgrp);
 }
 
 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
@@ -2406,7 +2405,6 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg2_type  = ARG_CONST_MAP_PTR,
.arg3_type  = ARG_ANYTHING,
 };
-#endif
 
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
  unsigned long off, unsigned long len)
@@ -2515,10 +2513,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
-#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
-#endif
default:
return sk_filter_func_proto(func_id);
}
-- 
1.9.3



Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:37 -0700, John Fastabend wrote:

> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index d10b762..f5b7254 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
>   if (qdisc_is_percpu_stats(q)) {
>   qdisc_qstats_cpu_backlog_inc(q, nskb);
>   qdisc_qstats_cpu_qlen_inc(q);
> + set_thread_flag(TIF_NEED_RESCHED);
>   } else {
>   qdisc_qstats_backlog_inc(q, nskb);
>   q->q.qlen++;

Hmm... care to elaborate this bit ?





[PATCH net-next 1/4] bpf: use skb_pkt_type_ok helper in bpf_skb_change_type

2016-08-17 Thread Daniel Borkmann
Since we have a skb_pkt_type_ok() helper for checking the type before
mangling, make use of it instead of open coding. Follow-up to commit
8b10cab64c13 ("net: simplify and make pkt_type_ok() available for other
users") that came in after d2485c4242a8 ("bpf: add bpf_skb_change_type
helper").

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index cb06ace..58b5e6d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1976,8 +1976,8 @@ static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
u32 pkt_type = r2;
 
/* We only allow a restricted subset to be changed for now. */
-   if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
-pkt_type > PACKET_OTHERHOST))
+   if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+!skb_pkt_type_ok(pkt_type)))
return -EINVAL;
 
skb->pkt_type = pkt_type;
-- 
1.9.3



[PATCH net-next 2/4] bpf: add bpf_skb_change_tail helper

2016-08-17 Thread Daniel Borkmann
This work adds a bpf_skb_change_tail() helper for tc BPF programs. The
basic idea is to expand or shrink the skb in a controlled manner. The
eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(),
bpf_lX_csum_replace() and others rather than passing a raw buffer for
writing here.

bpf_skb_change_tail() is really a slow path helper and intended for
replies with f.e. ICMP control messages. Concept is similar to other
helpers like bpf_skb_change_proto() helper to keep the helper without
protocol specifics and let the BPF program mangle the remaining parts.
A flags field has been added and is reserved for now should we extend
the helper in future.
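
A minimal usage sketch for the new helper (illustrative only; helper
declaration in the samples/bpf style of the time, program body made up):

/* Illustrative tc BPF program using the new helper. */
#include <linux/bpf.h>

static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) =
	(void *) BPF_FUNC_skb_change_tail;

__attribute__((section("classifier"), used))
int trim_pkt(struct __sk_buff *skb)
{
	/* shrink (or grow) the packet to 64 bytes; flags are reserved */
	if (bpf_skb_change_tail(skb, 64, 0) < 0)
		return -1;
	return 0;
}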

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/skbuff.h   |  43 +++-
 include/uapi/linux/bpf.h |  11 ++
 net/core/filter.c| 100 +--
 3 files changed, 150 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0f665cb..7047448 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
 
 int ___pskb_trim(struct sk_buff *skb, unsigned int len);
 
-static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
+static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
 {
if (unlikely(skb_is_nonlinear(skb))) {
WARN_ON(1);
@@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
skb_set_tail_pointer(skb, len);
 }
 
+static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
+{
+   __skb_set_length(skb, len);
+}
+
 void skb_trim(struct sk_buff *skb, unsigned int len);
 
 static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
@@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
BUG_ON(err);
 }
 
+static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
+{
+   unsigned int diff = len - skb->len;
+
+   if (skb_tailroom(skb) < diff) {
+   int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb),
+  GFP_ATOMIC);
+   if (ret)
+   return ret;
+   }
+   __skb_set_length(skb, len);
+   return 0;
+}
+
 /**
  * skb_orphan - orphan a buffer
  * @skb: buffer to orphan
@@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
return __pskb_trim(skb, len);
 }
 
+static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
+{
+   if (skb->ip_summed == CHECKSUM_COMPLETE)
+   skb->ip_summed = CHECKSUM_NONE;
+   __skb_trim(skb, len);
+   return 0;
+}
+
+static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
+{
+   if (skb->ip_summed == CHECKSUM_COMPLETE)
+   skb->ip_summed = CHECKSUM_NONE;
+   return __skb_grow(skb, len);
+}
+
 #define skb_queue_walk(queue, skb) \
for (skb = (queue)->next;					\
     skb != (struct sk_buff *)(queue);				\
@@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb)
return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
 }
 
+static inline void skb_gso_reset(struct sk_buff *skb)
+{
+   skb_shinfo(skb)->gso_size = 0;
+   skb_shinfo(skb)->gso_segs = 0;
+   skb_shinfo(skb)->gso_type = 0;
+}
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb);
 
 static inline bool skb_warn_if_lro(const struct sk_buff *skb)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 866d53c..e4c5a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -386,6 +386,17 @@ enum bpf_func_id {
 */
BPF_FUNC_current_task_under_cgroup,
 
+   /**
+* bpf_skb_change_tail(skb, len, flags)
+* The helper will resize the skb to the given new size,
+* to be used f.e. with control messages.
+* @skb: pointer to skb
+* @len: new skb length
+* @flags: reserved
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_skb_change_tail,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 58b5e6d..abf546d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1350,14 +1350,18 @@ struct bpf_scratchpad {
 
 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+   return skb_ensure_writable(skb, write_len);
+}
+
 static inline int bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
 {
-   

Re: [RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc

2016-08-17 Thread John Fastabend
On 16-08-17 03:58 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote:
>> Similar to how gso is handled skb_bad_tx needs to be per cpu to handle
>> lockless qdisc with multiple writer/producers.
>> @@ -1021,6 +1026,7 @@ err_out4:
>>  free_percpu(sch->cpu_bstats);
>>  free_percpu(sch->cpu_qstats);
>>  free_percpu(sch->gso_cpu_skb);
>> +free_percpu(sch->skb_bad_txq_cpu);
> 
> 
> This might be the time to group all these per cpu allocations to a
> single one, to help data locality and decrease overhead of having XX
> pointers.
> 
> 
> 

Sounds like a good idea to me. I'll go ahead and add a patch to the
front to consolidate the stats and then add these there.
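For illustration, such a consolidation could look roughly like the
following; the struct and field names here are invented, not taken
from the series:

/* one per-cpu block instead of four separate per-cpu allocations */
struct qdisc_cpu_data {
	struct gnet_stats_basic_cpu	bstats;
	struct gnet_stats_queue		qstats;
	struct gso_cell			gso_skb;
	struct sk_buff			*skb_bad_txq;
};

	sch->cpu_data = alloc_percpu(struct qdisc_cpu_data);
	if (!sch->cpu_data)
		goto errout;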


Re: [RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc

2016-08-17 Thread John Fastabend
On 16-08-17 03:53 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote:
>> The qdisc_reset operation depends on the qdisc lock at the moment
>> to halt any additions to gso_skb and statistics while the list is
>> free'd and the stats zeroed.
> 
> ...
> 
>> Signed-off-by: John Fastabend 
>> ---
>>  net/sched/sch_generic.c |   45 +++--
>>  1 file changed, 35 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
>> index 3b9a21f..29238c4 100644
>> --- a/net/sched/sch_generic.c
>> +++ b/net/sched/sch_generic.c
>> @@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc)
>>  kfree_skb(qdisc->skb_bad_txq);
>>  qdisc->skb_bad_txq = NULL;
>>  
>> +if (qdisc->gso_cpu_skb) {
>> +int i;
>> +
>> +for_each_possible_cpu(i) {
>> +struct gso_cell *cell;
>> +
>> +cell = per_cpu_ptr(qdisc->gso_cpu_skb, i);
>> +if (cell) {
>> +kfree_skb_list(cell->skb);
>> +cell = NULL;
> 
>   You probably wanted :
>   cell->skb = NULL;
> 

Yep thanks!

> 
>> +}
>> +}
>> +}
>> +
>>  if (qdisc->gso_skb) {
>>  kfree_skb_list(qdisc->gso_skb);
>>  qdisc->gso_skb = NULL;
>> @@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue 
>> *dev_queue,
>>  root_lock = qdisc_lock(oqdisc);
>>  spin_lock_bh(root_lock);
>>  
>> -/* Prune old scheduler */
>> -if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
>> -qdisc_reset(oqdisc);
>> -
>>  
> 
> This probably belongs to a separate patch, before any per cpu / lockless
> qdisc changes ?
> 
> 
> 

Agreed will do for next rev thanks for reviewing.
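With Eric's fix folded in, the reset loop above would read (sketch):

	for_each_possible_cpu(i) {
		struct gso_cell *cell;

		cell = per_cpu_ptr(qdisc->gso_cpu_skb, i);
		if (cell) {
			kfree_skb_list(cell->skb);
			cell->skb = NULL;	/* was: cell = NULL */
		}
	}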



Re: [RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:36 -0700, John Fastabend wrote:
> Similar to how gso is handled skb_bad_tx needs to be per cpu to handle
> lockless qdisc with multiple writer/producers.
> @@ -1021,6 +1026,7 @@ err_out4:
>   free_percpu(sch->cpu_bstats);
>   free_percpu(sch->cpu_qstats);
>   free_percpu(sch->gso_cpu_skb);
> + free_percpu(sch->skb_bad_txq_cpu);


This might be the time to group all these per cpu allocations to a
single one, to help data locality and decrease overhead of having XX
pointers.





Re: [PATCH net-next 3/3] net: veth: Set features for MPLS

2016-08-17 Thread David Ahern
On 8/17/16 4:41 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 14:49 -0700, David Ahern wrote:
>> veth does not really transmit packets; it only moves the skb from one
>> netdev to another, so gso and checksum are not really needed. Add
>> the features to mpls_features to get the same benefit and performance
>> with MPLS as without it.
> 
> It seems mpls_features should also be managed by bonding and team
> drivers ...
> 

We'll get there. veth is used in Lennert's example so adding an update for it 
now. Can add other devices in time.



Re: [PATCH net] netfilter: tproxy: properly refcount tcp listeners

2016-08-17 Thread Pablo Neira Ayuso
On Wed, Aug 17, 2016 at 09:56:46AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> inet_lookup_listener() and inet6_lookup_listener() no longer
> take a reference on the found listener.
> 
> This minimal patch adds back the refcounting, but we might do
> this differently in net-next later.

Applied, thanks Eric!


Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking

2016-08-17 Thread John Fastabend
On 16-08-17 03:33 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote:
>> This patch adds a flag for queueing disciplines to indicate the stack
>> does not need to use the qdisc lock to protect operations. This can
>> be used to build lockless scheduling algorithms and improving
>> performance.
>>
>> The flag is checked in the tx path and the qdisc lock is only taken
>> if it is not set. For now use a conditional if statement. Later we
>> could be more aggressive if it proves worthwhile and use a static key
>> or wrap this in a likely().
>>
>> Signed-off-by: John Fastabend 
>> ---
>>  include/net/pkt_sched.h   |4 +++-
>>  include/net/sch_generic.h |1 +
>>  net/core/dev.c|   32 
>>  net/sched/sch_generic.c   |   26 --
>>  4 files changed, 48 insertions(+), 15 deletions(-)
>>
>> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
>> index 7caa99b..69540c6 100644
>> --- a/include/net/pkt_sched.h
>> +++ b/include/net/pkt_sched.h
>> @@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q);
>>  
>>  static inline void qdisc_run(struct Qdisc *q)
>>  {
>> -if (qdisc_run_begin(q))
>> +if (qdisc_run_begin(q)) {
>>  __qdisc_run(q);
>> +qdisc_run_end(q);
>> +}
>>  }
> 
> 
> Looks like you could have a separate patch, removing qdisc_run_end()
> call done in __qdisc_run(q) ?
> 
> Then the 'allow qdiscs to handle locking'
> 
> 

Agreed that would clean this up a bit. Will do for next rev.
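The suggested preparatory patch would amount to roughly the following
against net/sched/sch_generic.c (a sketch: qdisc_run_end() moves out of
__qdisc_run() and into its callers, as in the hunk quoted above):

 void __qdisc_run(struct Qdisc *q)
 {
 	int quota = weight_p;
 	int packets;

 	while (qdisc_restart(q, &packets)) {
 		quota -= packets;
 		if (quota <= 0 || need_resched()) {
 			__netif_schedule(q);
 			break;
 		}
 	}
-
-	qdisc_run_end(q);
 }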


Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Florian Fainelli
On 08/17/2016 03:32 PM, Timur Tabi wrote:
> Timur Tabi wrote:
>>>
>>> Nothing prevents you from detailing in ACPI or DT sub-components of a
>>> larger HW block, if there is such a desire, but just make it in a way
>>> that it looks like what would be done for e.g: separate discrete parts,
>>> only the parenting of nodes would change.
>>
>> So instead of just having a single property, instead create a child node
>> just for the SGMII phy, with its own compatible string and various other
>> properties (like interrupt and base register)?  That's not a bad idea.
> 
> I need help getting this to work.
> 
> emac0: ethernet@3880 {
> compatible = "qcom,qdf2432-emac";
> reg-names = "base", "csr";
> reg = <0x0 0x3880 0x0 0x1 >,
> <0x0 0x38816000 0x0 0x1000>,
> <0x0 0x3881C000 0x0 0x4000>;
> interrupts = <0 0x100 0 0 0x104 0>;
> interrupt-names = "core0", "sgmii";
> 
> sgmii-handle = <&emac_sgmii>;
> phy-handle = <&phy0>;
> 
> #address-cells = <1>;
> #size-cells = <0>;
> phy0: ethernet-phy@4 {
> reg = <4>;
> };
> };
> 
> emac_sgmii: ethernet-phy@410400 {
> compatible = "qcom,qdf2432-emac-phy";
> reg = <0x0 0x00410400 0x0 0x100>;
> interrupts = <0 0x104 0>;
> };
> 

Is this register range relative to the emac0 node here, or is this
really a separate node, within the same address space as your emac0 node?

> When my driver probes, the platform_device object points to the emac0
> node, as always.
> 
> How do I parse the emac_sgmii node?  How do I get functions like
> platform_get_resource() to work?  How do I create a new platform_device
> object that points to the emac_sgmii node?

Answer largely depends on whether your device is really located outside
of the emac, if it located outside, then a platform device matching the
compatible string would get you what you want. If the emac_sgmii block
is a sub-block within the EMAC, then a few things need fixing:

- your emac_sgmii node should be a sub-node of the emac node, not a sibling
- the emac0 node should have a "ranges" property that indicates how to
translate the sub-nodes' "reg" property based on the base register
address of the emac0 block
- you would have to call of_platform_populate from the EMAC driver to
ensure that the emac_sgmii child node and therefore platform device gets
created
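A sketch of the last point, assuming a probe function named
emac_probe() (the real probe of course does much more than this):

#include <linux/of_platform.h>

static int emac_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	/* create a platform device for each child node, so the
	 * emac_sgmii sub-node gets bound by its own driver
	 */
	return of_platform_populate(dev->of_node, NULL, NULL, dev);
}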


-- 
Florian


Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking

2016-08-17 Thread John Fastabend
On 16-08-17 03:34 PM, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote:
> 
> 
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 4ce07dc..5db395d 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>>  int rc;
>>  
>>  qdisc_calculate_pkt_len(skb, q);
>> +
>> +if (q->flags & TCQ_F_NOLOCK) {
>> +if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
>> +__qdisc_drop(skb, &to_free);
>> +rc = NET_XMIT_DROP;
>> +} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) {
> 
> For a lockless qdisc, do you believe TCQ_F_CAN_BYPASS is still a gain ?
> 

For the benchmarks from pktgen it appears to be a win or mute to just
drop the TCQ_F_CAN_BYPASS (just taking a look at one sample below)

nolock & nobypass   locked (current master)
--
1:  1435796 1471479
2:  1880642 1746231
4:  1922935 1119626
8:  1585055 1001471
12: 1479273 989269

The only thing would be to test a bunch of netperf RR sessions to be
sure.

> Also !qdisc_qlen(q) looks racy anyway ?

Yep its racy unless you make it an atomic but this hurts performance
metrics. There is a patch further in the stack here that adds the
atomic variants but I tend to think we can just drop the bypass logic
in the lockless case assuming the netperf tests look good.

> 
>> +qdisc_bstats_cpu_update(q, skb);
>> +if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
>> +__qdisc_run(q);
>> +rc = NET_XMIT_SUCCESS;
>> +} else {
>> +rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
>> +__qdisc_run(q);
>> +}
>> +
>> +if (unlikely(to_free))
>> +kfree_skb_list(to_free);
>> +return rc;
>> +}
>> +
> 
> 



Re: [PATCH net-next 3/3] net: veth: Set features for MPLS

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 14:49 -0700, David Ahern wrote:
> veth does not really transmit packets; it only moves the skb from one
> netdev to another, so gso and checksum are not really needed. Add
> the features to mpls_features to get the same benefit and performance
> with MPLS as without it.

It seems mpls_features should also be managed by bonding and team
drivers ...





Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote:


> diff --git a/net/core/dev.c b/net/core/dev.c
> index 4ce07dc..5db395d 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>   int rc;
>  
>   qdisc_calculate_pkt_len(skb, q);
> +
> + if (q->flags & TCQ_F_NOLOCK) {
> + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
> + __qdisc_drop(skb, &to_free);
> + rc = NET_XMIT_DROP;
> + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) {

For a lockless qdisc, do you believe TCQ_F_CAN_BYPASS is still a gain ?

Also !qdisc_qlen(q) looks racy anyway ?

> + qdisc_bstats_cpu_update(q, skb);
> + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
> + __qdisc_run(q);
> + rc = NET_XMIT_SUCCESS;
> + } else {
> + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> + __qdisc_run(q);
> + }
> +
> + if (unlikely(to_free))
> + kfree_skb_list(to_free);
> + return rc;
> + }
> +




Re: [RFC PATCH 01/13] net: sched: allow qdiscs to handle locking

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 12:33 -0700, John Fastabend wrote:
> This patch adds a flag for queueing disciplines to indicate the stack
> does not need to use the qdisc lock to protect operations. This can
> be used to build lockless scheduling algorithms and improving
> performance.
> 
> The flag is checked in the tx path and the qdisc lock is only taken
> if it is not set. For now use a conditional if statement. Later we
> could be more aggressive if it proves worthwhile and use a static key
> or wrap this in a likely().
> 
> Signed-off-by: John Fastabend 
> ---
>  include/net/pkt_sched.h   |4 +++-
>  include/net/sch_generic.h |1 +
>  net/core/dev.c|   32 
>  net/sched/sch_generic.c   |   26 --
>  4 files changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index 7caa99b..69540c6 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q);
>  
>  static inline void qdisc_run(struct Qdisc *q)
>  {
> - if (qdisc_run_begin(q))
> + if (qdisc_run_begin(q)) {
>   __qdisc_run(q);
> + qdisc_run_end(q);
> + }
>  }


Looks like you could have a separate patch, removing qdisc_run_end()
call done in __qdisc_run(q) ?

Then the 'allow qdiscs to handle locking'




Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Timur Tabi wrote:


Nothing prevents you from detailing in ACPI or DT sub-components of a
larger HW block, if there is such a desire, but just make it in a way
that it looks like what would be done for e.g: separate discrete parts,
only the parenting of nodes would change.


So instead of just having a single property, instead create a child node
just for the SGMII phy, with its own compatible string and various other
properties (like interrupt and base register)?  That's not a bad idea.


I need help getting this to work.

emac0: ethernet@3880 {
compatible = "qcom,qdf2432-emac";
reg-names = "base", "csr";
reg = <0x0 0x3880 0x0 0x1 >,
<0x0 0x38816000 0x0 0x1000>,
<0x0 0x3881C000 0x0 0x4000>;
interrupts = <0 0x100 0 0 0x104 0>;
interrupt-names = "core0", "sgmii";

sgmii-handle = <&emac_sgmii>;
phy-handle = <&phy0>;

#address-cells = <1>;
#size-cells = <0>;
phy0: ethernet-phy@4 {
reg = <4>;
};
};

emac_sgmii: ethernet-phy@410400 {
compatible = "qcom,qdf2432-emac-phy";
reg = <0x0 0x00410400 0x0 0x100>;
interrupts = <0 0x104 0>;
};

When my driver probes, the platform_device object points to the emac0 
node, as always.


How do I parse the emac_sgmii node?  How do I get functions like 
platform_get_resource() to work?  How do I create a new platform_device 
object that points to the emac_sgmii node?


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


[PATCH net-next 2/3] net: mpls: Fixups for GSO

2016-08-17 Thread David Ahern
As reported by Lennert the MPLS GSO code is failing to properly segment
large packets. There are a couple of problems:

1. the inner protocol is not set so the gso segment functions for inner
   protocol layers are not getting run, and

2. MPLS labels for packets that use the "native" (non-OVS) MPLS code
   are not properly accounted for in mpls_gso_segment.

The MPLS GSO code was added for OVS. It is re-using skb_mac_gso_segment
to call the gso segment functions for the higher layer protocols. That
means skb_mac_gso_segment is called twice -- once with the network
protocol set to MPLS and again with the network protocol set to the
inner protocol.

This patch sets the inner skb protocol addressing item 1 above and sets
the network_header and inner_network_header to mark where the MPLS labels
start and end. The MPLS code in OVS is also updated to set the two
network markers.

From there the MPLS GSO code uses the difference between the network
header and the inner network header to know the size of the MPLS header
that was pushed. It then pulls the MPLS header, resets the mac_len and
protocol for the inner protocol and then calls skb_mac_gso_segment
to segment the skb. Afterwards the skb protocol is set to mpls for
each segment as suggested by Simon.

Reported-by: Lennert Buytenhek 
Signed-off-by: David Ahern 
---
 net/mpls/mpls_gso.c   | 24 +---
 net/mpls/mpls_iptunnel.c  |  5 +
 net/openvswitch/actions.c |  6 ++
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 2055e57ed1c3..fa6899f02cc8 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -22,33 +22,35 @@
 static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
   netdev_features_t features)
 {
+   int mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
struct sk_buff *segs = ERR_PTR(-EINVAL);
+   u16 mac_offset = skb->mac_header;
netdev_features_t mpls_features;
__be16 mpls_protocol;
+   u16 mac_len = skb->mac_len;
 
/* Setup inner SKB. */
mpls_protocol = skb->protocol;
skb->protocol = skb->inner_protocol;
 
-   /* Push back the mac header that skb_mac_gso_segment() has pulled.
-* It will be re-pulled by the call to skb_mac_gso_segment() below
-*/
-   __skb_push(skb, skb->mac_len);
+   __skb_pull(skb, mpls_hlen);
+   skb->mac_len = skb_inner_network_offset(skb);
 
/* Segment inner packet. */
mpls_features = skb->dev->mpls_features & features;
segs = skb_mac_gso_segment(skb, mpls_features);
-
+   if (IS_ERR_OR_NULL(segs)) {
+   skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset,
+mac_len);
+   goto out;
+   }
 
/* Restore outer protocol. */
skb->protocol = mpls_protocol;
+   for (skb = segs; skb; skb = skb->next)
+   skb->protocol = mpls_protocol;
 
-   /* Re-pull the mac header that the call to skb_mac_gso_segment()
-* above pulled.  It will be re-pushed after returning
-* skb_mac_gso_segment(), an indirect caller of this function.
-*/
-   __skb_pull(skb, skb->data - skb_mac_header(skb));
-
+out:
return segs;
 }
 
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index aed872cc05a6..55c5ab907563 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -90,7 +90,12 @@ static int mpls_xmit(struct sk_buff *skb)
if (skb_cow(skb, hh_len + new_header_size))
goto drop;
 
+   skb_set_inner_protocol(skb, skb->protocol);
+   skb_reset_inner_network_header(skb);
+   skb->encapsulation = 1;
+
skb_push(skb, new_header_size);
+
skb_reset_network_header(skb);
 
skb->dev = out_dev;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1ecbd7715f6d..6d78f162a88b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -167,6 +167,12 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb->mac_len);
skb_reset_mac_header(skb);
 
+   /* for GSO: set MPLS as network header and encapsulated protocol
+* header as inner network header
+*/
+   skb_set_network_header(skb, skb->mac_len);
+   skb_set_inner_network_header(skb, skb->mac_len + MPLS_HLEN);
+
new_mpls_lse = (__be32 *)skb_mpls_header(skb);
*new_mpls_lse = mpls->mpls_lse;
 
-- 
2.1.4



[PATCH net-next 3/3] net: veth: Set features for MPLS

2016-08-17 Thread David Ahern
veth does not really transmit packets; it only moves the skb from one
netdev to another, so gso and checksum are not really needed. Add
the features to mpls_features to get the same benefit and performance
with MPLS as without it.

Reported-by: Lennert Buytenhek 
Signed-off-by: David Ahern 
---
 drivers/net/veth.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..5db320a4d5cf 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -340,6 +340,7 @@ static void veth_setup(struct net_device *dev)
 
dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES;
+   dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
 }
 
 /*
-- 
2.1.4



[PATCH net-next 1/3] net: lwtunnel: Handle fragmentation

2016-08-17 Thread David Ahern
From: Roopa Prabhu 

Today mpls iptunnel lwtunnel_output redirect expects the tunnel
output function to handle fragmentation. This is ok, but it can be
avoided if we do not do the mpls output redirect too early;
i.e. we could wait until ip fragmentation is done and then call
mpls output for each ip fragment.

To make this work we will need:
1) the lwtunnel state to carry the encap headroom, and
2) the redirect to the encap output handler to happen on the ip fragment
(essentially do the output redirect after fragmentation).

This patch adds tunnel headroom in lwtstate to make sure we
account for tunnel data in mtu calculations during fragmentation
and adds new xmit redirect handler to redirect to lwtunnel xmit func
after ip fragmentation.

This includes IPV6 and some mtu fixes and testing from David Ahern.

Signed-off-by: Roopa Prabhu 
Signed-off-by: David Ahern 
---
 include/net/lwtunnel.h   | 44 
 net/core/lwtunnel.c  | 35 +++
 net/ipv4/ip_output.c |  8 
 net/ipv4/route.c |  4 +++-
 net/ipv6/ip6_output.c|  8 
 net/ipv6/route.c |  4 +++-
 net/mpls/mpls_iptunnel.c |  9 +
 7 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index e9f116e29c22..ea3f80f58fd6 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -13,6 +13,13 @@
 /* lw tunnel state flags */
 #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0)
 #define LWTUNNEL_STATE_INPUT_REDIRECT  BIT(1)
+#define LWTUNNEL_STATE_XMIT_REDIRECT   BIT(2)
+
+enum {
+   LWTUNNEL_XMIT_DONE,
+   LWTUNNEL_XMIT_CONTINUE,
+};
+
 
 struct lwtunnel_state {
__u16   type;
@@ -21,6 +28,7 @@ struct lwtunnel_state {
	int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
int (*orig_input)(struct sk_buff *);
int len;
+   __u16   headroom;
__u8data[0];
 };
 
@@ -34,6 +42,7 @@ struct lwtunnel_encap_ops {
  struct lwtunnel_state *lwtstate);
int (*get_encap_size)(struct lwtunnel_state *lwtstate);
int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
+   int (*xmit)(struct sk_buff *skb);
 };
 
 #ifdef CONFIG_LWTUNNEL
@@ -75,6 +84,24 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
 
return false;
 }
+
+static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
+{
+   if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT))
+   return true;
+
+   return false;
+}
+
+static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
+unsigned int mtu)
+{
+   if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu)
+   return lwtstate->headroom;
+
+   return 0;
+}
+
 int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
   unsigned int num);
 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
@@ -90,6 +117,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
+int lwtunnel_xmit(struct sk_buff *skb);
 
 #else
 
@@ -117,6 +145,17 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
return false;
 }
 
+static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
+{
+   return false;
+}
+
+static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
+unsigned int mtu)
+{
+   return 0;
+}
+
 static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 unsigned int num)
 {
@@ -170,6 +209,11 @@ static inline int lwtunnel_input(struct sk_buff *skb)
return -EOPNOTSUPP;
 }
 
+static inline int lwtunnel_xmit(struct sk_buff *skb)
+{
+   return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_LWTUNNEL */
 
 #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" 
__stringify(encap_type))
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 669ecc9f884e..e5f84c26ba1a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -251,6 +251,41 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(lwtunnel_output);
 
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+   struct dst_entry *dst = skb_dst(skb);
+   const struct lwtunnel_encap_ops *ops;
+   struct lwtunnel_state *lwtstate;
+   int ret = -EINVAL;
+
+   if (!dst)
+   goto drop;
+
+   lwtstate = dst->lwtstate;
+
+   if 
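[The archive truncates the message here. Judging from the
lwtunnel_output() pattern in the same file, the function presumably
continues along these lines; a reconstruction, not the verbatim patch:]

+   if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+       lwtstate->type > LWTUNNEL_ENCAP_MAX)
+       return 0;
+
+   ret = -EOPNOTSUPP;
+   rcu_read_lock();
+   ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+   if (likely(ops && ops->xmit))
+       ret = ops->xmit(skb);
+   rcu_read_unlock();
+
+   if (ret == -EOPNOTSUPP)
+       goto drop;
+
+   return ret;
+
+drop:
+   kfree_skb(skb);
+   return ret;
+}
+EXPORT_SYMBOL(lwtunnel_xmit);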

[PATCH v2 net-next 0/3] net: mpls: fragmentation and gso fixes for locally originated traffic

2016-08-17 Thread David Ahern
This series fixes mtu and fragmentation for tunnels using lwtunnel
output redirect, and fixes GSO for MPLS for locally originated traffic
reported by Lennert Buytenhek.

A follow on series will address fragmentation and GSO for forwarded
MPLS traffic. Hardware offload of GSO with MPLS also needs to be
addressed.

v2
- consistent use of network_header in skb to fix GSO for MPLS
- update MPLS code in OVS to network_header and inner_network_header

David Ahern (2):
  net: mpls: Fixups for GSO
  net: veth: Set features for MPLS

Roopa Prabhu (1):
  net: lwtunnel: Handle fragmentation

 drivers/net/veth.c|  1 +
 include/net/lwtunnel.h| 44 
 net/core/lwtunnel.c   | 35 +++
 net/ipv4/ip_output.c  |  8 
 net/ipv4/route.c  |  4 +++-
 net/ipv6/ip6_output.c |  8 
 net/ipv6/route.c  |  4 +++-
 net/mpls/mpls_gso.c   | 24 +---
 net/mpls/mpls_iptunnel.c  | 14 ++
 net/openvswitch/actions.c |  6 ++
 10 files changed, 131 insertions(+), 17 deletions(-)

-- 
2.1.4



[PATCH iproute2] ip: report IFLA_GSO_MAX_SIZE and IFLA_GSO_MAX_SEGS

2016-08-17 Thread Eric Dumazet
From: Eric Dumazet 

kernel support for these attributes was added in linux-4.6

Signed-off-by: Eric Dumazet 
---
 ip/ipaddress.c |8 
 1 file changed, 8 insertions(+)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index ab4b1b1..76bd7b3 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -901,6 +901,14 @@ int print_linkinfo(const struct sockaddr_nl *who,
fprintf(fp, "numrxqueues %u ",
rta_getattr_u32(tb[IFLA_NUM_RX_QUEUES]));
 
+   if (tb[IFLA_GSO_MAX_SIZE])
+   fprintf(fp, "gso_max_size %u ",
+   rta_getattr_u32(tb[IFLA_GSO_MAX_SIZE]));
+
+   if (tb[IFLA_GSO_MAX_SEGS])
+   fprintf(fp, "gso_max_segs %u ",
+   rta_getattr_u32(tb[IFLA_GSO_MAX_SEGS]));
+
if (tb[IFLA_PHYS_PORT_NAME])
fprintf(fp, "portname %s ",
rta_getattr_str(tb[IFLA_PHYS_PORT_NAME]));
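With this applied, an "ip -d link show" line carries the new fields,
roughly like this (the values shown are just the common kernel
defaults, for illustration):

2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP ...
    numtxqueues 8 numrxqueues 8 gso_max_size 65536 gso_max_segs 65535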




Re: [iproute PATCH 1/2] ipaddress: Simplify vf_info parsing

2016-08-17 Thread Phil Sutter
On Wed, Jun 01, 2016 at 10:03:49PM +0200, Phil Sutter wrote:
> Not sure whether I misinterpret commit 7b8179c780a1a, but it looks
> overly complicated. Instead rely upon parse_rtattr_nested() to assign
> the relevant pointer if requested rtattr fields are present.

In order to validate correctness of this patch, I compiled a kernel
which does not export IFLA_VF_SPOOFCHK and verified correct
functionality.

Is there anything else I could do in order to convince you to accept
these patches?

Thanks, Phil
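For reference, the simplified pattern relied upon looks roughly like
this (a sketch; variable names are illustrative):

	struct rtattr *vf[IFLA_VF_MAX + 1];

	parse_rtattr_nested(vf, IFLA_VF_MAX, vfinfo);

	/* attributes the kernel did not send simply stay NULL */
	if (vf[IFLA_VF_SPOOFCHK]) {
		struct ifla_vf_spoofchk *vf_spoofchk =
			RTA_DATA(vf[IFLA_VF_SPOOFCHK]);

		fprintf(fp, ", spoof checking %s",
			vf_spoofchk->setting ? "on" : "off");
	}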


[PATCH net-next] dsa: mv88e6xxx: Timeout based on iterations

2016-08-17 Thread Andrew Lunn
The mv88e6xxx driver times out operations on the switch by
looping until an elapsed wall clock time is reached. However, if
usleep_range() sleeps much longer than expected, it could time out with
an error without actually checking to see if the device has completed
the operation. So replace the elapsed time with a fixed upper bound on
the number of loops.

Testing on various switches has shown that operations take either 0 or
1 iteration, so a maximum of 16 iterations is a safe limit.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index a230fcba5b64..ac8e9af4879f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -309,9 +309,9 @@ static int mv88e6xxx_serdes_write(struct mv88e6xxx_chip *chip, int reg, u16 val)
 static int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg,
  u16 mask)
 {
-   unsigned long timeout = jiffies + HZ / 10;
+   int i;
 
-   while (time_before(jiffies, timeout)) {
+   for (i = 0; i < 16; i++) {
u16 val;
int err;
 
@@ -375,7 +375,7 @@ static int _mv88e6xxx_reg_write(struct mv88e6xxx_chip *chip, int addr,
 static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip)
 {
int ret;
-   unsigned long timeout;
+   int i;
 
ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_CONTROL);
if (ret < 0)
@@ -386,8 +386,7 @@ static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip)
if (ret)
return ret;
 
-   timeout = jiffies + 1 * HZ;
-   while (time_before(jiffies, timeout)) {
+   for (i = 0; i < 16; i++) {
ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_STATUS);
if (ret < 0)
return ret;
@@ -403,8 +402,7 @@ static int mv88e6xxx_ppu_disable(struct mv88e6xxx_chip *chip)
 
 static int mv88e6xxx_ppu_enable(struct mv88e6xxx_chip *chip)
 {
-   int ret, err;
-   unsigned long timeout;
+   int ret, err, i;
 
ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_CONTROL);
if (ret < 0)
@@ -415,8 +413,7 @@ static int mv88e6xxx_ppu_enable(struct mv88e6xxx_chip *chip)
if (err)
return err;
 
-   timeout = jiffies + 1 * HZ;
-   while (time_before(jiffies, timeout)) {
+   for (i = 0; i < 16; i++) {
ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, GLOBAL_STATUS);
if (ret < 0)
return ret;
-- 
2.8.1



[PATCH net-next] tcp: refine tcp_prune_ofo_queue() to not drop all packets

2016-08-17 Thread Eric Dumazet
From: Eric Dumazet 

Over the years, TCP BDP has increased a lot, and is typically
in the order of ~10 Mbytes with help of clever Congestion Control
modules.

In presence of packet losses, TCP stores incoming packets into an out of
order queue, and number of skbs sitting there waiting for the missing
packets to be received can match the BDP (~10 Mbytes)

In some cases, TCP needs to make room for incoming skbs, and current
strategy can simply remove all skbs in the out of order queue as a last
resort, incurring a huge penalty, both for receiver and sender.

Unfortunately these 'last resort events' are quite frequent, forcing
sender to send all packets again, stalling the flow and wasting a lot of
resources.

This patch cleans only a part of the out of order queue in order
to meet the memory constraints.

Signed-off-by: Eric Dumazet 
Cc: Neal Cardwell 
Cc: Yuchung Cheng 
Cc: Soheil Hassas Yeganeh 
Cc: C. Stephen Gun 
Cc: Van Jacobson 
---
 net/ipv4/tcp_input.c |   47 -
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ebf45b38bc309f448dbc4f27fe8722cefabaf19..8cd02c0b056cbc22e2e4a4fe8530b74f7bd25419 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
if (tcp_prune_queue(sk) < 0)
return -1;
 
-   if (!sk_rmem_schedule(sk, skb, size)) {
+   while (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
-
-   if (!sk_rmem_schedule(sk, skb, size))
-   return -1;
}
}
return 0;
@@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 }
 
 /*
- * Purge the out-of-order queue.
- * Return true if queue was pruned.
+ * Clean the out-of-order queue to make room.
+ * We drop packets with high sequences to:
+ * 1) let a chance for holes to be filled, and
+ * 2) not add too big latencies if thousands of packets sit there.
+ *    (But if the application shrinks SO_RCVBUF, we could still end up
+ *    freeing the whole queue here.)
+ *
+ * Return true if queue has shrunk.
  */
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
struct tcp_sock *tp = tcp_sk(sk);
-   bool res = false;
+   struct sk_buff *skb;
 
-   if (!skb_queue_empty(&tp->out_of_order_queue)) {
-   NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-   __skb_queue_purge(&tp->out_of_order_queue);
+   if (skb_queue_empty(&tp->out_of_order_queue))
+   return false;
 
-   /* Reset SACK state.  A conforming SACK implementation will
-* do the same at a timeout based retransmit.  When a connection
-* is in a sad state like this, we care only about integrity
-* of the connection not performance.
-*/
-   if (tp->rx_opt.sack_ok)
-   tcp_sack_reset(&tp->rx_opt);
+   NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+
+   while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
+   tcp_drop(sk, skb);
sk_mem_reclaim(sk);
-   res = true;
+   if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+   !tcp_under_memory_pressure(sk))
+   break;
}
-   return res;
+
+   /* Reset SACK state.  A conforming SACK implementation will
+* do the same at a timeout based retransmit.  When a connection
+* is in a sad state like this, we care only about integrity
+* of the connection not performance.
+*/
+   if (tp->rx_opt.sack_ok)
+   tcp_sack_reset(&tp->rx_opt);
+   return true;
 }
 
 /* Reduce allocated memory if we can, trying to get




[PATCH net-next] net: bgmac: make it clear when setting interface type to RMII

2016-08-17 Thread Rafał Miłecki
From: Rafał Miłecki 

It doesn't really change anything as BGMAC_CHIPCTL_1_IF_TYPE_RMII is
equal to 0. It makes the code a bit cleaner though; so far, when reading
it, one could think we forgot to set a proper mode. It also keeps this mode code in
sync with other ones.

Signed-off-by: Rafał Miłecki 
---
 drivers/net/ethernet/broadcom/bgmac.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index 63ef7235..6ea0e5f 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -932,7 +932,8 @@ static void bgmac_chip_reset(struct bgmac *bgmac)
et_swtype <<= 4;
sw_type = et_swtype;
} else if (bgmac->feature_flags & BGMAC_FEAT_SW_TYPE_EPHYRMII) {
-   sw_type = BGMAC_CHIPCTL_1_SW_TYPE_EPHYRMII;
+   sw_type = BGMAC_CHIPCTL_1_IF_TYPE_RMII |
+ BGMAC_CHIPCTL_1_SW_TYPE_EPHYRMII;
} else if (bgmac->feature_flags & BGMAC_FEAT_SW_TYPE_RGMII) {
sw_type = BGMAC_CHIPCTL_1_IF_TYPE_RGMII |
  BGMAC_CHIPCTL_1_SW_TYPE_RGMII;
-- 
1.8.4.5



[PATCH net-next] net: bgmac: support Ethernet core on BCM53573 SoCs

2016-08-17 Thread Rafał Miłecki
From: Rafał Miłecki 

BCM53573 is a new series of Broadcom's SoCs. It's based on ARM and can
be found in two packages (versions): BCM53573 and BCM47189. It shares
some code with the Northstar family, but also requires some new quirks.

First of all there can be up to 2 Ethernet cores on this SoC. If that is
the case, they are connected to two different switch ports allowing some
more complex/optimized setups. It seems the second unit doesn't come
fully configured and requires some IRQ quirk.

Other than that, only the first core is connected to the PHY. For the
second one we have to register a fixed PHY (similarly to the Northstar),
otherwise the generic PHY driver would get some invalid info.

This has been successfully tested on Tenda AC9 (BCM47189B0).

Signed-off-by: Rafał Miłecki 
---
 drivers/net/ethernet/broadcom/bgmac-bcma.c | 19 ++-
 drivers/net/ethernet/broadcom/bgmac.c  | 25 +
 drivers/net/ethernet/broadcom/bgmac.h  | 19 +++
 include/linux/bcma/bcma.h  |  3 +++
 include/linux/bcma/bcma_regs.h |  1 +
 5 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c
index 9a9745c4..3bc0a04 100644
--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c
+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c
@@ -92,6 +92,7 @@ MODULE_DEVICE_TABLE(bcma, bgmac_bcma_tbl);
 /* http://bcm-v4.sipsolutions.net/mac-gbit/gmac/chipattach */
 static int bgmac_probe(struct bcma_device *core)
 {
+   struct bcma_chipinfo *ci = &core->bus->chipinfo;
struct ssb_sprom *sprom = &core->bus->sprom;
struct mii_bus *mii_bus;
struct bgmac *bgmac;
@@ -157,7 +158,8 @@ static int bgmac_probe(struct bcma_device *core)
dev_info(bgmac->dev, "Found PHY addr: %d%s\n", bgmac->phyaddr,
 bgmac->phyaddr == BGMAC_PHY_NOREGS ? " (NOREGS)" : "");
 
-   if (!bgmac_is_bcm4707_family(core)) {
+   if (!bgmac_is_bcm4707_family(core) &&
+   !(ci->id == BCMA_CHIP_ID_BCM53573 && core->core_unit == 1)) {
mii_bus = bcma_mdio_mii_register(core, bgmac->phyaddr);
if (!IS_ERR(mii_bus)) {
err = PTR_ERR(mii_bus);
@@ -230,6 +232,21 @@ static int bgmac_probe(struct bcma_device *core)
bgmac->feature_flags |= BGMAC_FEAT_NO_RESET;
bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500;
break;
+   case BCMA_CHIP_ID_BCM53573:
+   bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
+   bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
+   if (ci->pkg == BCMA_PKG_ID_BCM47189)
+   bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED;
+   if (core->core_unit == 0) {
+   bgmac->feature_flags |= BGMAC_FEAT_CC4_IF_SW_TYPE;
+   if (ci->pkg == BCMA_PKG_ID_BCM47189)
+   bgmac->feature_flags |=
+   BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII;
+   } else if (core->core_unit == 1) {
+   bgmac->feature_flags |= BGMAC_FEAT_IRQ_ID_OOB_6;
+   bgmac->feature_flags |= BGMAC_FEAT_CC7_IF_TYPE_RGMII;
+   }
+   break;
default:
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index c4751ec..63ef7235 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -940,6 +940,27 @@ static void bgmac_chip_reset(struct bgmac *bgmac)
bgmac_cco_ctl_maskset(bgmac, 1, ~(BGMAC_CHIPCTL_1_IF_TYPE_MASK |
  BGMAC_CHIPCTL_1_SW_TYPE_MASK),
  sw_type);
+   } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE) {
+   u32 sw_type = BGMAC_CHIPCTL_4_IF_TYPE_MII |
+ BGMAC_CHIPCTL_4_SW_TYPE_EPHY;
+   u8 et_swtype = 0;
+   char buf[4];
+
+   if (bcm47xx_nvram_getenv("et_swtype", buf, sizeof(buf)) > 0) {
+   if (kstrtou8(buf, 0, &et_swtype))
+   dev_err(bgmac->dev, "Failed to parse et_swtype (%s)\n",
+   buf);
+   sw_type = (et_swtype & 0x0f) << 12;
+   } else if (bgmac->feature_flags & BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII) {
+   sw_type = BGMAC_CHIPCTL_4_IF_TYPE_RGMII |
+ BGMAC_CHIPCTL_4_SW_TYPE_RGMII;
+   }
+   bgmac_cco_ctl_maskset(bgmac, 4, ~(BGMAC_CHIPCTL_4_IF_TYPE_MASK |
+ BGMAC_CHIPCTL_4_SW_TYPE_MASK),
+ 

Re: [PATCH iproute] ipila: Fixed unitialized variables

2016-08-17 Thread Stephen Hemminger
On Mon, 15 Aug 2016 16:30:22 -0700
Tom Herbert  wrote:

> Initialize locator and locator_match to zero and only do
> addattr if they have been set.
> 
> Signed-off-by: Tom Herbert 

Applied thanks


Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Andrew Lunn
> Something is odd about of_mdiobus_register().
> 
> This function scans child nodes to look for PHYs:
> 
> /* Loop over the child nodes and register a phy_device for each phy */
> for_each_available_child_of_node(np, child) {
>   addr = of_mdio_parse_addr(&mdio->dev, child);
> 
> And in my driver, this works.  However, later when I try to call
> of_phy_find_device(), it fails.  That's because of_phy_find_device()
> wants the np of the child node.  But the only way to get that is
> with a phy-phandle property which I need to manually parse.

You do need the phy-handle, because you can have multiple PHYs, and
ethernet switches on the MDIO bus. You need to indicate which of those
PHYs is connected to this MAC.

> So what's the point of having of_mdiobus_register() parse child
> nodes, if you need a phy-phandle pointer anyway?

So we know about the other phys and switches on the MDIO bus, can load
the driver for them, etc. This is standard policy for a bus
driver. When you instantiate the bus, you enumerate it to see what is
connected to it. You do that either with of_mdiobus_register() or
mdiobus_register().

However, I do agree that having a helper do the phy-handle parsing
would be nice, and it should also handle fixed phys as well.
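A sketch of the manual parsing such a helper could wrap (assuming the
MAC's device_node is "np" and emac_adjust_link() is the link handler):

	struct device_node *phy_np;
	struct phy_device *phydev;

	phy_np = of_parse_phandle(np, "phy-handle", 0);
	if (!phy_np)
		return -ENODEV;

	phydev = of_phy_connect(netdev, phy_np, emac_adjust_link, 0,
				PHY_INTERFACE_MODE_SGMII);
	of_node_put(phy_np);
	if (!phydev)
		return -ENODEV;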

  Andrew


Re: [PATCH iproute2] ip route: restore_handler should check tb[RTA_PREFSRC] for local networks

2016-08-17 Thread Stephen Hemminger
On Wed, 17 Aug 2016 11:32:53 +0800
Xin Long  wrote:

> Hi, Stephen
> 
> any update on this ?

Was waiting long enough to get a comment or two.


Re: [PATCH iproute2 v2 2/2] tipc: refactor bearer identification

2016-08-17 Thread Stephen Hemminger
On Mon, 15 Aug 2016 10:24:32 +0200
Richard Alpe  wrote:

> Introduce a generic function (nl_add_bearer_name()) that identifies a
> bearer and adds it to an existing netlink message. This reduces code
> complexity and makes the code a little bit easier to maintain.
> 
> Signed-off-by: Richard Alpe 

Thanks for keeping TIPC support up to date.
Applied.


Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Florian Fainelli wrote:


If it has self identification, and the MAC is capable of knowing what
kind of PHY it is internally bundled with, why does it matter to
represent that specific piece of information in Device Tree or ACPI?


That's the thing - the MAC is not capable of knowing.  From a hardware 
perspective, the MAC and PHY are physically distinct IP blocks, and so 
the MAC hardware has no idea what it's really connected to.  In theory, 
we don't even need an internal PHY, but if there is an internal PHY, we 
have to program it correctly.


This is why we need a DT and ACPI property (or whatever) to tell the 
driver what kind of PHY it's attached to.



The problem is that the internal PHY has no mechanism for
self-identification.  There is no ACPI or device tree node for it. There
is no register you can query to see if it's there or what version it
is.  The MAC part of the EMAC has no knowledge of the PHY part.  The
driver just has to know it's there.


Nothing prevents you from detailing in ACPI or DT sub-components of a
larger HW block, if there is such a desire, but just make it in a way
that it looks like what would be done for e.g: separate discrete parts,
only the parenting of nodes would change.


So instead of just having a single property, instead create a child node 
just for the SGMII phy, with its own compatible string and various other 
properties (like interrupt and base register)?  That's not a bad idea.


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Florian Fainelli
On 08/16/2016 02:37 PM, Timur Tabi wrote:
> Al Stone wrote:
>> Does the ACPI portion of the driver*have*  to know about the PHY?  In
>> general,
>> the ACPI assumption on ARM [**] is that those have all been set up
>> before we
>> get to the kernel.  So, does it need to be visible to the ACPI part of
>> the
>> driver at all?
> 
> Yes, the driver supports both "v1" and "v2" of the PHY, and it has code
> like this:
> 
> if (adpt->phy.version == 2)
> emac_sgmii_init_v2(adpt);
> else
> emac_sgmii_init_v1(adpt);
> 
> The question, how should adpt->phy.version be initialized on device tree
> and ACPI platforms?
> 
> The reason why this is confusing is because there are conflicting
> perspectives of this "internal PHY".  It both is and is not part of the
> EMAC.  It is a separate block in the chip, and can be replaced with
> other SGMII blocks, and in the future there will be a "v3" and maybe a
> "v4" or who knows what.  In fact, maybe using version numbers is
> inappropriate and we'll need to use vendor names or something.

If it has self identification, and the MAC is capable of knowing what
kind of PHY it is internally bundled with, why does it matter to
represent that specific piece of information in Device Tree or ACPI?

> 
> The problem is that the internal PHY has no mechanism for
> self-identification.  There is no ACPI or device tree node for it. There
> is no register you can query to see if it's there or what version it
> is.  The MAC part of the EMAC has no knowledge of the PHY part.  The
> driver just has to know it's there.

Nothing prevents you from detailing in ACPI or DT sub-components of a
larger HW block, if there is such a desire, but just make it in a way
that it looks like what would be done for e.g: separate discrete parts,
only the parenting of nodes would change.

> 
> This is why I'm advocating a separate property (DT and ACPI) to identify
> the internal PHY
> 
> The truth is that I don't really care HOW it's done, and as long as we
> come to a consensus soon.  I want my driver to be merged into 4.9.
> 


-- 
Florian


Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Florian Fainelli
On 08/17/2016 12:38 PM, Timur Tabi wrote:
>> If this is an external PHY, the expectation is that it will be hanging
>> off a MDIO bus controller, either the MDIO bus internal to the MAC, or
>> an external MDIO bus (separate register range).
> 
> So for starters, I forgot to delete the 'compatible' property from this
> document.  Its presence breaks of_mdiobus_child_is_phy().

The MDIO devices and Ethernet PHY binding should certainly be improved,
but the bulk of it is either you use ethernet-phy-ieee802.3-c22 or
ethernet-phy-ieee802.3-c45, or nothing. Else, the MDIO bus probing code
just assumes it can match a MDIO device (not specifically an Ethernet
PHY) with its compatible string.

> 
>> If such a PHY node is provided, the expectation is that your Ethernet
>> MAC will have a phy-handle property and a phy-mode property to specify
>> how to connect to this external PHY, so in your specific case, better
>> remove the PHY from your example, or detail further how it should be
>> done.
> 
> Something is odd about of_mdiobus_register().
> 
> This function scans child nodes to look for PHYs:
> 
> /* Loop over the child nodes and register a phy_device for each phy */
> for_each_available_child_of_node(np, child) {
> addr = of_mdio_parse_addr(&mdio->dev, child);
> 
> And in my driver, this works.  However, later when I try to call
> of_phy_find_device(), it fails.  That's because of_phy_find_device()
> wants the np of the child node.  But the only way to get that is with a
> phy-phandle property which I need to manually parse.

of_parse_phandle() should return the device_node pointer associated with
the phy-handle property which you can later use and pass to
of_phy_find_device(). There are plenty of examples in the tree and the
logic is almost always the same.

> 
> So what's the point of having of_mdiobus_register() parse child nodes,
> if you need a phy-phandle pointer anyway?

Not all MDIO devices are Ethernet PHYs; some of them are Ethernet
switches, or any other PHY (USB, PCIE, SATA) just sitting on the same
MDIO bus HW instance, so you need to create device_node and struct device
instances for these devices listed in the Device Tree, since the bus is
not self-discoverable (like I2C).

Your Ethernet controller somehow needs to know which Ethernet PHY it is
attached to since the location of the Ethernet PHY on the bus could
change from design to design, let alone the actual topology (you could
have a GPIO bit-banged MDIO bus or something else).

>> Since you have a check on CONFIG_QCOM_EMAC in emac/Makefile, you could
>> always recurse into that directory while building (use obj-y).
> 
> Obviously, having "obj-$(CONFIG_QCOM_EMAC)" in both Makefiles is
> redundant, but wouldn't it make more sense to change
> "obj-$(CONFIG_QCOM_EMAC)" to "obj-y" in
> drivers/net/ethernet/qualcomm/emac/Makefile, so that I only recurse if
> necessary?

Whichever makes the most sense, when there is a directory involved, my
preference is to always recurse in that directory and selectively
compile from there, since the Kconfig/Makefile options are all located
within the same hierarchical level, just a preference.

> 
>>> >+static void emac_adjust_link(struct net_device *netdev)
>>> >+{
>>> >+struct emac_adapter *adpt = netdev_priv(netdev);
>>> >+struct phy_device *phydev = netdev->phydev;
>>> >+
>>> >+if (phydev->link)
>>> >+emac_mac_start(adpt);
>>> >+else
>>> >+emac_mac_stop(adpt);
> 
>> This is really excessive here, you typically don't need to completely
>> stop your transmitter/receiver/DMA/what have you, just reprogram the MAC
>> with the appropriate link speed/duplex/pause parameters.
> 
> Yes, I realize that a lot of the reinit code in my driver is "heavy",
> and that it could be optimized for specific situations.  However, I'd
> like to save that for another patch, because it would require me to
> study the documentation and do extensive tests.
> 
> I can't tell you today whether emac_mac_start() is overkill for the
> specific link change.  For all I know, the hardware does require the TX
> to stop before changing the link speed.  Remember, the EMAC does have a
> weird double-phy (one internal, one external).

Fair enough. Not having to stop the TX from the SW perspective should be
kind of a basic requirement here; most drivers/HW don't seem to require
that, because even if packets were in flight at the time the link went
down, the MAC should sense the carrier loss and report it accordingly.
Now, there could be many different ways to implement and screw that up,
so yes, a future patch seems like a good approach.
-- 
Florian


Re: [RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

2016-08-17 Thread John Fastabend
On 16-08-17 12:38 PM, John Fastabend wrote:
> The sch_mq qdisc creates a sub-qdisc per tx queue which are then
> called independently for enqueue and dequeue operations. However
> statistics are aggregated and pushed up to the "master" qdisc.
> 
> This patch adds support for any of the sub-qdiscs to be per cpu
> statistic qdiscs. To handle this case add a check when calculating
> stats and aggregate the per cpu stats if needed.
> 
> Also exports __gnet_stats_copy_queue() to use as a helper function.
> 
> Signed-off-by: John Fastabend 
> ---

[...]

> + if (qdisc_is_percpu_stats(qdisc)) {
> + cpu_bstats = qdisc->cpu_bstats;
> + cpu_qstats = qdisc->cpu_qstats;
> + }
> +
> + qlen = qdisc_qlen_sum(qdisc);
> +
> + __gnet_stats_copy_basic(NULL, &sch->bstats,
> + cpu_bstats, &qdisc->bstats);
> + __gnet_stats_copy_queue(&sch->qstats,
> + cpu_qstats, &qdisc->qstats, qlen);


Also, I forgot to bump this onto the atomic qlen; it is still
using the per cpu counters, giving an incorrect qlen total.



Re: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule

2016-08-17 Thread John Fastabend
On 16-08-17 12:37 PM, John Fastabend wrote:
> netif_schedule uses a bit QDISC_STATE_SCHED to tell the qdisc layer
> if a run of the qdisc has been scheduled. This is important when
> tearing down qdisc instances. We can rcu_free an instance, for example,
> if it's possible that we might have outstanding references to it.
> 
> Perhaps more importantly in the per cpu lockless case we need to
> schedule a run of the qdisc on all qdiscs that are enqueueing packets
> and hitting the gso_skb requeue logic or else the skb may get stuck
> on the gso_skb queue without anything to finish the xmit.
> 
> This patch uses a reference counter instead of a bit to account for
> the multiple CPUs.
> ---

Oops, the commit message is incorrect here; it actually uses a per cpu
state bitmask to track this.



[RFC PATCH 00/13] Series short description

2016-08-17 Thread John Fastabend
I've been working on this for a bit now and figured it's time for a v2 RFC. As
usual, any comments, suggestions, observations, musings, etc. are appreciated.

Latest round of lockless qdisc patch set with performance metric primarily
using pktgen to inject pkts into the qdisc layer. Some simple netperf tests
below as well but those need to be done correctly.

This v2 RFC version fixes a couple flaws in the original series. The first
major one was that the per_cpu accounting of qlen is not correct with respect
to the qdisc bypass. Using per cpu counters for qlen allows a flow to be
enqueueing packets into the qdisc and then get scheduled on another
core and bypass the qdisc completely if that core is not in use. I've reworked
the logic to use an atomic which is _correct_ now but unfortunately costs
a lot in performance. With a single pfifo_fast and 12 threads of pktgen
I still see a ~200k pps improvement even with atomic accounting so it is
still a win but nothing like the +1Mpps without the atomic accounting.

On the mq tests, atomic vs per cpu seems to be in the noise, I believe
because the mq qdisc is already aligned with a pfifo_fast qdisc per core,
with the XPS setup I'm running mapping 1:1.

Any thoughts around this would be interesting to hear. My general thinking
around this is to submit the atomic version for inclusion and then start
to improve it with a few items listed below.

Additionally, I've added a __netif_schedule() to the bad_skb_tx path;
otherwise I observed a pkt getting stuck on the bad_txq_cpu pointer
and sitting in the qdisc structure until it was kicked again
by another pkt or netif_schedule. And on the netif_schedule() topic,
to support per cpu handling of gso and bad_txq_cpu we have to allow
the netif_schedule() logic to fire on a per cpu model as well.

Otherwise a bunch of small stylistic changes were made and I still need
to do another pass to catch checkpatch warnings/errors and try to do a bit
more cleanup around the statistics if/else branching. This series also
has both the atomic qlen code and the per cpu qlen code as I continue
to think up some scheme around the atomic qlen issue.

But this series seems to be working.
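For reference, the per cpu flavour of the qlen accounting looks
roughly like this (illustrative only; the atomic variant replaces the
per cpu sum with a single atomic counter):

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
	__u32 qlen = 0;
	int i;

	if (q->flags & TCQ_F_NOLOCK) {
		/* cheap, but racy against the bypass check */
		for_each_possible_cpu(i)
			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
	} else {
		qlen = q->q.qlen;
	}
	return qlen;
}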

Future work is the following,

- convert all qdiscs over to per cpu handling and cleanup the
rather ugly if/else statistics handling. Although a bit of
work, it's mechanical and should help some.

- I'm looking at fq_codel to see how to make it "lockless".

- It seems we can drop the TX_HARD_LOCK on cases where the
  nic exposes a queue per core now that we have enqueue/dequeue
  decoupled. The idea being a bunch of threads enqueue and per
  core dequeue logic runs. Requires XPS to be setup.

- qlen improvements somehow

- look at improvements to the skb_array structure. We can look
  at drop in replacements and/or improving it.


Below is the data I took from pktgen,

./samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -t $NUM -i eth3

I did a run of 4 each time and took the total summation of each
thread. There are four different runs for the mq and pfifo_fast
cases. "without qlen atomic" uses per queue qlen values and allows
bypassing the qdisc via bypass flag, this is incorrect but shows
the impact of having an atomic in the mix. "with qlen atomic" shows
the correct implementation with atomics and bypass enabled. And
finally "without qlen atomic and no bypass" uses per cpu qlen
values and disables bypass to ensure ooo packets are not created.
To be clear the submitted patches here are the "with qlen atomic"
metrics.

nolock pfifo_fast (without qlen atomic)

1:  1440293 1421602 1409553 1393469 1424543
2:  1754890 1819292 1727948 1797711 1743427
4:  3282665 3344095 3315220 3332777 3348972
8:  2940079 1644450 2950777 2922085 2946310
12: 2042084 2610060 2857581 3493162 3104611

nolock pfifo_fast (with qlen atomic)
1:  1425231 1417176 1402862 1432880
2:  1631437 1633398 1630867 1628816
4:  1704383 1709900 1706274 1710198
8:  1348672 1344343 1339072 1334288
12: 1262988 1280724 1262237 1262615 

nolock pfifo_fast (without qlen atomic and no bypass)
1:  1435796 1458522 1471855 1455658
2:  1880642 1876359 1872879 1884578
4:  1922935 1914589 1912832 1912116
8:  1585055 1576887 1577086 1570236
12: 1479273 1450706 1447056 1466330

lock (pfifo_fast)
1:  1471479 1469142 1458825 1456788 1453952
2:  1746231 1749490 1753176 1753780 1755959
4:  1119626 1120515 1121478 1119220 1121115
8:  1001471  999308 1000318 1000776 1000384
12:  989269  992122  991590  986581  990430

nolock (mq with per cpu qlen)
1:   1435952  1459523  1448860  1385451   1435031
2:   2850662  2855702  2859105  2855443   2843382
4:   5288135  5271192  5252242  5270192   5311642
8:  10042731 10018063  9891813  9968382   9956727
12: 13265277 13384199 13438955 13363771  13436198

nolock (mq with qlen atomic)
1:   1558253  1562285  1555037  1558422
2:   2917449  2952852  2921697  2892313
4:   5518243  5375300 

[RFC PATCH 12/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

2016-08-17 Thread John Fastabend
The sch_mq qdisc creates a sub-qdisc per tx queue which are then
called independently for enqueue and dequeue operations. However
statistics are aggregated and pushed up to the "master" qdisc.

This patch adds support for any of the sub-qdiscs to be per cpu
statistic qdiscs. To handle this case add a check when calculating
stats and aggregate the per cpu stats if needed.

Also export __gnet_stats_copy_queue() for use as a helper function.

Signed-off-by: John Fastabend 
---
 include/net/gen_stats.h |3 +++
 net/core/gen_stats.c|9 +
 net/sched/sch_mq.c  |   25 ++---
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121..5ddc88b 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -47,6 +47,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d,
 int gnet_stats_copy_queue(struct gnet_dump *d,
  struct gnet_stats_queue __percpu *cpu_q,
  struct gnet_stats_queue *q, __u32 qlen);
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+const struct gnet_stats_queue __percpu *cpu_q,
+const struct gnet_stats_queue *q, __u32 qlen);
 int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 
 int gnet_stats_finish_copy(struct gnet_dump *d);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051..a503547 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -254,10 +254,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue 
*qstats,
}
 }
 
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
-   const struct gnet_stats_queue __percpu *cpu,
-   const struct gnet_stats_queue *q,
-   __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+const struct gnet_stats_queue __percpu *cpu,
+const struct gnet_stats_queue *q,
+__u32 qlen)
 {
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -271,6 +271,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue 
*qstats,
 
qstats->qlen = qlen;
 }
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index b943982..f4b5bbb 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mq_sched {
struct Qdisc**qdiscs;
@@ -107,15 +108,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
memset(&sch->qstats, 0, sizeof(sch->qstats));
 
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+   struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+   struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+   __u32 qlen = 0;
+
qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
-   sch->q.qlen += qdisc->q.qlen;
-   sch->bstats.bytes   += qdisc->bstats.bytes;
-   sch->bstats.packets += qdisc->bstats.packets;
-   sch->qstats.backlog += qdisc->qstats.backlog;
-   sch->qstats.drops   += qdisc->qstats.drops;
-   sch->qstats.requeues+= qdisc->qstats.requeues;
-   sch->qstats.overlimits  += qdisc->qstats.overlimits;
+
+   if (qdisc_is_percpu_stats(qdisc)) {
+   cpu_bstats = qdisc->cpu_bstats;
+   cpu_qstats = qdisc->cpu_qstats;
+   }
+
+   qlen = qdisc_qlen_sum(qdisc);
+
+   __gnet_stats_copy_basic(NULL, &sch->bstats,
+   cpu_bstats, &qdisc->bstats);
+   __gnet_stats_copy_queue(&sch->qstats,
+   cpu_qstats, &qdisc->qstats, qlen);
+
spin_unlock_bh(qdisc_lock(qdisc));
}
return 0;



[RFC PATCH 11/13] net: sched: pfifo_fast use alf_queue

2016-08-17 Thread John Fastabend
This converts the pfifo_fast qdisc to use the alf_queue enqueue and
dequeue routines then sets the NOLOCK bit.

This also removes the logic used to pick the next band to dequeue from
and instead just checks each alf_queue for packets from top priority
to lowest. This might need to be a bit more clever but seems to work
for now.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |  133 +++
 1 file changed, 77 insertions(+), 56 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f5b7254..c41a5dd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
 
 /*
  * Private data for a pfifo_fast scheduler containing:
- * - queues for the three band
- * - bitmap indicating which of the bands contain skbs
+ * - rings for priority bands
  */
 struct pfifo_fast_priv {
-   u32 bitmap;
-   struct sk_buff_head q[PFIFO_FAST_BANDS];
+   struct skb_array q[PFIFO_FAST_BANDS];
 };
 
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * bitmap=0 means there are no skbs on any band.
- * bitmap=1 means there is an skb on band 0.
- * bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
-int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+ int band)
 {
-   return priv->q + band;
+   return &priv->q[band];
 }
 
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
  struct sk_buff **to_free)
 {
-   if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
-   int band = prio2band[skb->priority & TC_PRIO_MAX];
-   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   struct sk_buff_head *list = band2list(priv, band);
-
-   priv->bitmap |= (1 << band);
-   qdisc->q.qlen++;
-   return __qdisc_enqueue_tail(skb, qdisc, list);
-   }
+   int band = prio2band[skb->priority & TC_PRIO_MAX];
+   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+   struct skb_array *q = band2list(priv, band);
+   int err;
 
-   return qdisc_drop(skb, qdisc, to_free);
+   err = skb_array_produce(q, skb);
+
+   if (unlikely(err))
+   return qdisc_drop_cpu(skb, qdisc, to_free);
+
+   qdisc_qstats_cpu_qlen_inc(qdisc);
+   qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+   return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   int band = bitmap2band[priv->bitmap];
+   struct sk_buff *skb = NULL;
+   int band;
 
-   if (likely(band >= 0)) {
-   struct sk_buff_head *list = band2list(priv, band);
-   struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+   for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+   struct skb_array *q = band2list(priv, band);
 
-   qdisc->q.qlen--;
-   if (skb_queue_empty(list))
-   priv->bitmap &= ~(1 << band);
+   if (__skb_array_empty(q))
+   continue;
 
-   return skb;
+   skb = skb_array_consume(q);
}
 
-   return NULL;
-}
-
-static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
-{
-   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   int band = bitmap2band[priv->bitmap];
-
-   if (band >= 0) {
-   struct sk_buff_head *list = band2list(priv, band);
-
-   return skb_peek(list);
+   if (likely(skb)) {
+   qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+   qdisc_bstats_cpu_update(qdisc, skb);
+   qdisc_qstats_cpu_qlen_dec(qdisc);
}
 
-   return NULL;
+   return skb;
 }
 
 static void pfifo_fast_reset(struct Qdisc *qdisc)
 {
-   int prio;
+   int i, band;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
-   for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-   __qdisc_reset_queue(band2list(priv, prio));
+   for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+   struct skb_array *q = band2list(priv, band);
+   struct sk_buff *skb;
 
-   priv->bitmap = 0;
-   qdisc->qstats.backlog = 0;
-   qdisc->q.qlen = 0;
+   while ((skb = skb_array_consume(q)) != NULL)
+   kfree_skb(skb);
+   }
+
+   for_each_possible_cpu(i) {
+   struct gnet_stats_queue *q = 

Re: [PATCH] [v8] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Florian Fainelli wrote:

On 08/11/2016 02:34 PM, Timur Tabi wrote:

>Add supports for ethernet controller HW on Qualcomm Technologies, Inc. SoC.
>This driver supports the following features:
>1) Checksum offload.
>2) Interrupt coalescing support.
>3) SGMII phy.
>4) phylib interface for external phy



OK, so this is looking good now, just a few nits; feel free to submit
them as clean up patches should this one be accepted.


I will post a v9.


>+soc {
>+   #address-cells = <1>;
>+   #size-cells = <1>;
>+   dma-ranges = <0 0 0x>;
>+
>+   emac0: ethernet@feb2 {
>+   compatible = "qcom,fsm9900-emac";
>+   #address-cells = <1>;
>+   #size-cells = <1>;
>+   reg-names = "base", "csr", "ptp", "sgmii";
>+   reg =   <0xfeb2 0x1>,
>+   <0xfeb36000 0x1000>,
>+   <0xfeb3c000 0x4000>,
>+   <0xfeb38000 0x400>;
>+   interrupt-parent = <>;
>+   #interrupt-cells = <1>;
>+   interrupts = <76 80>;
>+   interrupt-names = "emac_core0", "sgmii_irq";
>+   phy0: ethernet-phy@0 {
>+   compatible = "qcom,fsm9900-emac-phy";
>+   reg = <0>;
>+   }



If this is an external PHY, the expectation is that it will be hanging
off a MDIO bus controller, either the MDIO bus internal to the MAC, or
an external MDIO bus (separate register range).


So for starters, I forgot to delete the 'compatible' property from this 
document.  Its presence breaks of_mdiobus_child_is_phy().



If such a PHY node is provided, the expectation is that your Ethernet
MAC will have a phy-handle property and a phy-mode property to specify
how to connect to this external PHY, so in your specific case, better
remove the PHY from your example, or detail further how it should be done.


Something is odd about of_mdiobus_register().

This function scans child nodes to look for PHYs:

/* Loop over the child nodes and register a phy_device for each phy */
for_each_available_child_of_node(np, child) {
addr = of_mdio_parse_addr(&mdio->dev, child);

And in my driver, this works.  However, later when I try to call 
of_phy_find_device(), it fails.  That's because of_phy_find_device() 
wants the np of the child node.  But the only way to get that is with a 
phy-handle property which I need to manually parse.


So what's the point of having of_mdiobus_register() parse child nodes, 
if you need a phy-handle property anyway?
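
For illustration, the manual pattern I mean looks roughly like this
(a sketch only; the pdev/netdev variables, emac_adjust_link and the
SGMII interface mode are just this driver's context, and error
handling is trimmed):

    struct device_node *phy_np;
    struct phy_device *phydev;

    /* of_mdiobus_register() already created the phy_device; this
     * just finds it again starting from the MAC node.
     */
    phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
    if (!phy_np)
        return -ENODEV;

    phydev = of_phy_connect(netdev, phy_np, emac_adjust_link, 0,
                            PHY_INTERFACE_MODE_SGMII);
    of_node_put(phy_np);
    if (!phydev)
        return -ENODEV;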



>diff --git a/drivers/net/ethernet/qualcomm/Kconfig 
b/drivers/net/ethernet/qualcomm/Kconfig
>index a76e380..85b599f 100644
>--- a/drivers/net/ethernet/qualcomm/Kconfig
>+++ b/drivers/net/ethernet/qualcomm/Kconfig
>@@ -24,4 +24,15 @@ config QCA7000
>  To compile this driver as a module, choose M here. The module
>  will be called qcaspi.
>
>+config QCOM_EMAC
>+   tristate "Qualcomm Technologies, Inc. EMAC Gigabit Ethernet support"
>+   select CRC32



select PHYLIB?


Ok.


>diff --git a/drivers/net/ethernet/qualcomm/Makefile 
b/drivers/net/ethernet/qualcomm/Makefile
>index 9da2d75..1b3a0ce 100644
>--- a/drivers/net/ethernet/qualcomm/Makefile
>+++ b/drivers/net/ethernet/qualcomm/Makefile
>@@ -4,3 +4,5 @@
>
>  obj-$(CONFIG_QCA7000) += qcaspi.o
>  qcaspi-objs := qca_spi.o qca_framing.o qca_7k.o qca_debug.o
>+
>+obj-$(CONFIG_QCOM_EMAC) += emac/



Since you have a check on CONFIG_QCOM_EMAC in emac/Makefile, you could
always recurse into that directory while building (use obj-y).


Obviously, having "obj-$(CONFIG_QCOM_EMAC)" in both Makefiles is 
redundant, but wouldn't it make more sense to change 
"obj-$(CONFIG_QCOM_EMAC)" to "obj-y" in 
drivers/net/ethernet/qualcomm/emac/Makefile, so that I only recurse if 
necessary?



>+static void emac_adjust_link(struct net_device *netdev)
>+{
>+   struct emac_adapter *adpt = netdev_priv(netdev);
>+   struct phy_device *phydev = netdev->phydev;
>+
>+   if (phydev->link)
>+   emac_mac_start(adpt);
>+   else
>+   emac_mac_stop(adpt);



This is really excessive here, you typically don't need to completely
stop your transmitter/receiver/DMA/what have you, just reprogram the MAC
with the appropriate link speed/duplex/pause parameters.
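
Roughly something like this (a sketch only; emac_mac_config_speed()
and the old_speed/old_duplex fields are made-up names, and link-down
handling is elided):

    static void emac_adjust_link(struct net_device *netdev)
    {
        struct emac_adapter *adpt = netdev_priv(netdev);
        struct phy_device *phydev = netdev->phydev;

        /* only rewrite the MAC config when something changed */
        if (phydev->link &&
            (phydev->speed != adpt->old_speed ||
             phydev->duplex != adpt->old_duplex)) {
            emac_mac_config_speed(adpt, phydev->speed, phydev->duplex);
            adpt->old_speed = phydev->speed;
            adpt->old_duplex = phydev->duplex;
        }
    }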


Yes, I realize that a lot of the reinit code in my driver is "heavy", 
and that it could be optimized for specific situations.  However, I'd 
like to save that for another patch, because it would require me to 
study the documentation and do extensive tests.


I can't tell you today whether emac_mac_start() is overkill for the 
specific link change.  For all I know, the hardware does require the TX 
to stop before changing the link speed.  Remember, the EMAC does have a 
weird double-phy (one internal, one external).


--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


[RFC PATCH 13/13] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio

2016-08-17 Thread John Fastabend
The sch_mqprio qdisc creates a sub-qdisc per tx queue which are then
called independently for enqueue and dequeue operations. However
statistics are aggregated and pushed up to the "master" qdisc.

This patch adds support for any of the sub-qdiscs to be per cpu
statistic qdiscs. To handle this case add a check when calculating
stats and aggregate the per cpu stats if needed.

Signed-off-by: John Fastabend 
---
 net/sched/sch_mqprio.c |   61 +++-
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 549c663..24f0360 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -229,22 +229,32 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff 
*skb)
unsigned char *b = skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
-   unsigned int i;
+   unsigned int ntx, tc;
 
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
 
-   for (i = 0; i < dev->num_tx_queues; i++) {
-   qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
+   for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+   struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+   struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+   __u32 qlen = 0;
+
+   qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
-   sch->q.qlen += qdisc->q.qlen;
-   sch->bstats.bytes   += qdisc->bstats.bytes;
-   sch->bstats.packets += qdisc->bstats.packets;
-   sch->qstats.backlog += qdisc->qstats.backlog;
-   sch->qstats.drops   += qdisc->qstats.drops;
-   sch->qstats.requeues+= qdisc->qstats.requeues;
-   sch->qstats.overlimits  += qdisc->qstats.overlimits;
+
+   if (qdisc_is_percpu_stats(qdisc)) {
+   cpu_bstats = qdisc->cpu_bstats;
+   cpu_qstats = qdisc->cpu_qstats;
+   }
+
+   qlen = qdisc_qlen_sum(qdisc);
+
+   __gnet_stats_copy_basic(NULL, &sch->bstats,
+   cpu_bstats, &qdisc->bstats);
+   __gnet_stats_copy_queue(&sch->qstats,
+   cpu_qstats, &qdisc->qstats, qlen);
+
spin_unlock_bh(qdisc_lock(qdisc));
}
 
@@ -252,9 +262,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff 
*skb)
memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
opt.hw = priv->hw_owned;
 
-   for (i = 0; i < netdev_get_num_tc(dev); i++) {
-   opt.count[i] = dev->tc_to_txq[i].count;
-   opt.offset[i] = dev->tc_to_txq[i].offset;
+   for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
+   opt.count[tc] = dev->tc_to_txq[tc].count;
+   opt.offset[tc] = dev->tc_to_txq[tc].offset;
}
 
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
@@ -332,7 +342,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, 
unsigned long cl,
if (cl <= netdev_get_num_tc(dev)) {
int i;
__u32 qlen = 0;
-   struct Qdisc *qdisc;
struct gnet_stats_queue qstats = {0};
struct gnet_stats_basic_packed bstats = {0};
struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
@@ -347,18 +356,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, 
unsigned long cl,
 
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+   struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
+   struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+   struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 
-   qdisc = rtnl_dereference(q->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
-   qlen  += qdisc->q.qlen;
-   bstats.bytes  += qdisc->bstats.bytes;
-   bstats.packets+= qdisc->bstats.packets;
-   qstats.backlog+= qdisc->qstats.backlog;
-   qstats.drops  += qdisc->qstats.drops;
-   qstats.requeues   += qdisc->qstats.requeues;
-   qstats.overlimits += qdisc->qstats.overlimits;
+   if (qdisc_is_percpu_stats(qdisc)) {
+   cpu_bstats = qdisc->cpu_bstats;
+   cpu_qstats = qdisc->cpu_qstats;
+   }
+
+   qlen = qdisc_qlen_sum(qdisc);
__gnet_stats_copy_basic(NULL, &bstats,
+ 

[RFC PATCH 08/13] net: sched: support skb_bad_tx with lockless qdisc

2016-08-17 Thread John Fastabend
Similar to how gso is handled, skb_bad_tx needs to be per cpu to handle
a lockless qdisc with multiple writers/producers.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |7 +++
 net/sched/sch_api.c   |6 +++
 net/sched/sch_generic.c   |   95 +
 3 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0864813..d465fb9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -40,6 +40,10 @@ struct gso_cell {
struct sk_buff *skb;
 };
 
+struct bad_txq_cell {
+   struct sk_buff *skb;
+};
+
 struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
   struct Qdisc *sch,
@@ -77,7 +81,8 @@ struct Qdisc {
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
 
-   struct gso_cell __percpu *gso_cpu_skb;
+   struct gso_cell __percpu *gso_cpu_skb;
+   struct bad_txq_cell __percpu *skb_bad_txq_cpu;
 
/*
 * For performance sake on SMP, we put highly modified fields at the end
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d713052..b90a23a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -970,6 +970,11 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
sch->gso_cpu_skb = alloc_percpu(struct gso_cell);
if (!sch->gso_cpu_skb)
goto err_out4;
+
+   sch->skb_bad_txq_cpu =
+   alloc_percpu(struct bad_txq_cell);
+   if (!sch->skb_bad_txq_cpu)
+   goto err_out4;
}
 
if (tca[TCA_STAB]) {
@@ -1021,6 +1026,7 @@ err_out4:
free_percpu(sch->cpu_bstats);
free_percpu(sch->cpu_qstats);
free_percpu(sch->gso_cpu_skb);
+   free_percpu(sch->skb_bad_txq_cpu);
/*
 * Any broken qdiscs that would require a ops->reset() here?
 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 29238c4..d10b762 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,6 +44,43 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *sch)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   return cell->skb;
+   }
+
+   return sch->skb_bad_txq;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *sch,
+struct sk_buff *skb)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   cell->skb = skb;
+   __netif_schedule(sch);
+   return;
+   }
+
+   sch->skb_bad_txq = skb;
+}
+
+static inline void qdisc_null_skb_bad_txq(struct Qdisc *sch)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   cell->skb = NULL;
+   return;
+   }
+
+   sch->skb_bad_txq = NULL;
+}
+
 static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch)
 {
if (sch->gso_cpu_skb)
@@ -129,9 +166,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
if (!nskb)
break;
if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
-   q->skb_bad_txq = nskb;
-   qdisc_qstats_backlog_inc(q, nskb);
-   q->q.qlen++;
+   qdisc_enqueue_skb_bad_txq(q, nskb);
+
+   if (qdisc_is_percpu_stats(q)) {
+   qdisc_qstats_cpu_backlog_inc(q, nskb);
+   qdisc_qstats_cpu_qlen_inc(q);
+   } else {
+   qdisc_qstats_backlog_inc(q, nskb);
+   q->q.qlen++;
+   }
break;
}
skb->next = nskb;
@@ -160,7 +203,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool 
*validate,
qdisc_null_gso_skb(q);
 
if (qdisc_is_percpu_stats(q)) {
-   qdisc_qstats_cpu_backlog_inc(q, skb);
+   qdisc_qstats_cpu_backlog_dec(q, skb);
qdisc_qstats_cpu_qlen_dec(q);
} else {
qdisc_qstats_backlog_dec(q, skb);

[RFC PATCH 09/13] net: sched: helper to sum qlen

2016-08-17 Thread John Fastabend
Reporting qlen when qlen is per cpu requires aggregating the per
cpu counters. This adds a helper routine for this.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   15 +++
 net/sched/sch_api.c   |3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d465fb9..cc28af0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -280,6 +280,21 @@ static inline int qdisc_qlen(const struct Qdisc *q)
return q->q.qlen;
 }
 
+static inline int qdisc_qlen_sum(const struct Qdisc *q)
+{
+   __u32 qlen = 0;
+   int i;
+
+   if (q->flags & TCQ_F_NOLOCK) {
+   for_each_possible_cpu(i)
+   qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
+   } else {
+   qlen = q->q.qlen;
+   }
+
+   return qlen;
+}
+
 static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
 {
return (struct qdisc_skb_cb *)skb->cb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b90a23a..6c5bf13 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1370,7 +1370,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct 
Qdisc *q, u32 clid,
goto nla_put_failure;
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
-   qlen = q->q.qlen;
+
+   qlen = qdisc_qlen_sum(q);
 
stab = rtnl_dereference(q->stab);
if (stab && qdisc_dump_stab(skb, stab) < 0)



[RFC PATCH 10/13] net: sched: lockless support for netif_schedule

2016-08-17 Thread John Fastabend
netif_schedule uses a bit QDISC_STATE_SCHED to tell the qdisc layer
if a run of the qdisc has been scheduled. This is important when
tearing down qdisc instances. We can rcu_free an instance, for example,
if it's possible that we might have outstanding references to it.

Perhaps more importantly, in the per cpu lockless case we need to
schedule a run of the qdisc on all qdiscs that are enqueueing packets
and hitting the gso_skb requeue logic, or else the skb may get stuck
on the gso_skb queue without anything to finish the xmit.

This patch uses a reference counter instead of a bit to account for
the multiple CPUs.
---
 include/net/sch_generic.h |1 +
 net/core/dev.c|   32 +++-
 net/sched/sch_api.c   |5 +
 net/sched/sch_generic.c   |   16 +++-
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cc28af0..2e0e5b0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -94,6 +94,7 @@ struct Qdisc {
seqcount_t  running;
struct gnet_stats_queue qstats;
unsigned long   state;
+   unsigned long __percpu  *cpu_state;
struct Qdisc*next_sched;
struct sk_buff  *skb_bad_txq;
struct rcu_head rcu_head;
diff --git a/net/core/dev.c b/net/core/dev.c
index 5db395d..f491845 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2272,8 +2272,14 @@ static void __netif_reschedule(struct Qdisc *q)
 
 void __netif_schedule(struct Qdisc *q)
 {
-   if (!test_and_set_bit(__QDISC_STATE_SCHED, >state))
+   if (q->flags & TCQ_F_NOLOCK) {
+   unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+   if (!test_and_set_bit(__QDISC_STATE_SCHED, s))
+   __netif_reschedule(q);
} else if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
__netif_reschedule(q);
+   }
 }
 EXPORT_SYMBOL(__netif_schedule);
 
@@ -3925,15 +3931,23 @@ static void net_tx_action(struct softirq_action *h)
if (!(q->flags & TCQ_F_NOLOCK)) {
root_lock = qdisc_lock(q);
spin_lock(root_lock);
-   }
-   /* We need to make sure head->next_sched is read
-* before clearing __QDISC_STATE_SCHED
-*/
-   smp_mb__before_atomic();
-   clear_bit(__QDISC_STATE_SCHED, &q->state);
-   qdisc_run(q);
-   if (!(q->flags & TCQ_F_NOLOCK))
+
+   /* We need to make sure head->next_sched is read
+* before clearing __QDISC_STATE_SCHED
+*/
+   smp_mb__before_atomic();
+   clear_bit(__QDISC_STATE_SCHED, &q->state);
+
+   qdisc_run(q);
+
spin_unlock(root_lock);
+   } else {
+   unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+   smp_mb__before_atomic();
+   clear_bit(__QDISC_STATE_SCHED, s);
+   __qdisc_run(q);
+   }
}
}
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6c5bf13..89989a6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -975,6 +975,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
alloc_percpu(struct bad_txq_cell);
if (!sch->skb_bad_txq_cpu)
goto err_out4;
+
+   sch->cpu_state = alloc_percpu(unsigned long);
+   if (!sch->cpu_state)
+   goto err_out4;
}
 
if (tca[TCA_STAB]) {
@@ -1027,6 +1031,7 @@ err_out4:
free_percpu(sch->cpu_qstats);
free_percpu(sch->gso_cpu_skb);
free_percpu(sch->skb_bad_txq_cpu);
+   free_percpu(sch->cpu_state);
/*
 * Any broken qdiscs that would require a ops->reset() here?
 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index d10b762..f5b7254 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
if (qdisc_is_percpu_stats(q)) {
qdisc_qstats_cpu_backlog_inc(q, nskb);
qdisc_qstats_cpu_qlen_inc(q);
+   set_thread_flag(TIF_NEED_RESCHED);
} else {
qdisc_qstats_backlog_inc(q, nskb);
 

[RFC PATCH 05/13] net: sched: a dflt qdisc may be used with per cpu stats

2016-08-17 Thread John Fastabend
Enable dflt qdisc support for per cpu stats. Before this patch a
dflt qdisc was required to use the global statistics qstats and
bstats.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |   24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index af32418..f8fec81 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -645,18 +645,34 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue 
*dev_queue,
struct Qdisc *sch;
 
if (!try_module_get(ops->owner))
-   goto errout;
+   return NULL;
 
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
-   goto errout;
+   return NULL;
sch->parent = parentid;
 
-   if (!ops->init || ops->init(sch, NULL) == 0)
+   if (!ops->init)
return sch;
 
-   qdisc_destroy(sch);
+   if (ops->init(sch, NULL))
+   goto errout;
+
+   /* init() may have set percpu flags so init data structures */
+   if (qdisc_is_percpu_stats(sch)) {
+   sch->cpu_bstats =
+   netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+   if (!sch->cpu_bstats)
+   goto errout;
+
+   sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+   if (!sch->cpu_qstats)
+   goto errout;
+   }
+
+   return sch;
 errout:
+   qdisc_destroy(sch);
return NULL;
 }
 EXPORT_SYMBOL(qdisc_create_dflt);



[RFC PATCH 06/13] net: sched: per cpu gso handlers

2016-08-17 Thread John Fastabend
The net sched infrastructure has a gso ptr that points to skb structs
that have failed to be enqueued by the device driver.

This can happen when multiple cores try to push a skb onto the same
underlying hardware queue resulting in lock contention. This case is
handled by a cpu collision handler handle_dev_cpu_collision(). Another
case occurs when the stack overruns the drivers low level tx queues
capacity. Ideally these should be a rare occurrence in a well-tuned
system but they do happen.

To handle this in the lockless case use a per cpu gso field to park
the skb until the conflict can be resolved. Note at this point the
skb has already been popped off the qdisc so it has to be handled
by the infrastructure.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   39 +
 net/sched/sch_api.c   |7 
 net/sched/sch_generic.c   |   71 ++---
 3 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 193cf8c..0864813 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,10 @@ struct qdisc_size_table {
u16 data[];
 };
 
+struct gso_cell {
+   struct sk_buff *skb;
+};
+
 struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
   struct Qdisc *sch,
@@ -73,6 +77,8 @@ struct Qdisc {
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
 
+   struct gso_cell __percpu *gso_cpu_skb;
+
/*
 * For performance sake on SMP, we put highly modified fields at the end
 */
@@ -744,6 +750,23 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct 
Qdisc *sch)
return sch->gso_skb;
 }
 
+static inline struct sk_buff *qdisc_peek_dequeued_cpu(struct Qdisc *sch)
+{
+   struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb);
+
+   if (!gso->skb) {
+   struct sk_buff *skb = sch->dequeue(sch);
+
+   if (skb) {
+   gso->skb = skb;
+   qdisc_qstats_cpu_backlog_inc(sch, skb);
+   qdisc_qstats_cpu_qlen_inc(sch);
+   }
+   }
+
+   return gso->skb;
+}
+
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
@@ -760,6 +783,22 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct 
Qdisc *sch)
return skb;
 }
 
+static inline struct sk_buff *qdisc_dequeue_peeked_skb(struct Qdisc *sch)
+{
+   struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb);
+   struct sk_buff *skb = gso->skb;
+
+   if (skb) {
+   gso->skb = NULL;
+   qdisc_qstats_cpu_backlog_dec(sch, skb);
+   qdisc_qstats_cpu_qlen_dec(sch);
+   } else {
+   skb = sch->dequeue(sch);
+   }
+
+   return skb;
+}
+
 static inline void __qdisc_reset_queue(struct sk_buff_head *list)
 {
/*
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 12ebde8..d713052 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -966,6 +966,12 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
goto err_out4;
}
 
+   if (sch->flags & TCQ_F_NOLOCK) {
+   sch->gso_cpu_skb = alloc_percpu(struct gso_cell);
+   if (!sch->gso_cpu_skb)
+   goto err_out4;
+   }
+
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB]);
if (IS_ERR(stab)) {
@@ -1014,6 +1020,7 @@ err_out:
 err_out4:
free_percpu(sch->cpu_bstats);
free_percpu(sch->cpu_qstats);
+   free_percpu(sch->gso_cpu_skb);
/*
 * Any broken qdiscs that would require a ops->reset() here?
 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f8fec81..3b9a21f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,8 +44,25 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch)
+{
+   if (sch->gso_cpu_skb)
+   return (this_cpu_ptr(sch->gso_cpu_skb))->skb;
 
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+   return sch->gso_skb;
+}
+
+static inline void qdisc_null_gso_skb(struct Qdisc *sch)
+{
+   if (sch->gso_cpu_skb) {
+   (this_cpu_ptr(sch->gso_cpu_skb))->skb = NULL;
+   return;
+   }
+
+   sch->gso_skb = NULL;
+}
+
+static inline int 

[RFC PATCH 07/13] net: sched: support qdisc_reset on NOLOCK qdisc

2016-08-17 Thread John Fastabend
The qdisc_reset operation depends on the qdisc lock at the moment
to halt any additions to gso_skb and statistics while the list is
free'd and the stats zeroed.

Without the qdisc lock we can not guarantee another cpu is not in
the process of adding a skb to one of the "cells". Here are the
two cases we have to handle.

 case 1: qdisc_graft operation. In this case a "new" qdisc is attached
 and the 'qdisc_destroy' operation is called on the old qdisc.
 The destroy operation will wait a rcu grace period and call
 qdisc_rcu_free(). At which point gso_cpu_skb is free'd along
 with all stats so no need to zero stats and gso_cpu_skb from
 the reset operation itself.

 Because we can not continue to call qdisc_reset before waiting
 an rcu grace period (so that the qdisc is detached from all
 cpus), simply do not call qdisc_reset() at all and let the
 qdisc_destroy operation clean up the qdisc. Note, a refcnt
 greater than 1 would cause the destroy operation to be
 aborted however if this ever happened the reference to the
 qdisc would be lost and we would have a memory leak.

 case 2: dev_deactivate sequence. This can come from a user bringing
 the interface down which causes the gso_skb list to be flushed
 and the qlen zero'd. At the moment this is protected by the
 qdisc lock so while we clear the qlen/gso_skb fields we are
 guaranteed no new skbs are added. For the lockless case
 though this is not true. To resolve this move the qdisc_reset
 call after the new qdisc is assigned and a grace period is
 exercised to ensure no new skbs can be enqueued. Further
 the RTNL lock is held so we can not get another call to
 activate the qdisc while the skb lists are being free'd.

 Finally, fix qdisc_reset to handle the per cpu stats and
 skb lists.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |   45 +++--
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3b9a21f..29238c4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -737,6 +737,20 @@ void qdisc_reset(struct Qdisc *qdisc)
kfree_skb(qdisc->skb_bad_txq);
qdisc->skb_bad_txq = NULL;
 
+   if (qdisc->gso_cpu_skb) {
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct gso_cell *cell;
+
+   cell = per_cpu_ptr(qdisc->gso_cpu_skb, i);
+   if (cell) {
+   kfree_skb_list(cell->skb);
+   cell->skb = NULL;
+   }
+   }
+   }
+
if (qdisc->gso_skb) {
kfree_skb_list(qdisc->gso_skb);
qdisc->gso_skb = NULL;
@@ -812,10 +826,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue 
*dev_queue,
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
 
-   /* Prune old scheduler */
-   if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
-   qdisc_reset(oqdisc);
-
/* ... and graft new one */
if (qdisc == NULL)
qdisc = _qdisc;
@@ -929,7 +939,6 @@ static void dev_deactivate_queue(struct net_device *dev,
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
 
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
-   qdisc_reset(qdisc);
 
spin_unlock_bh(qdisc_lock(qdisc));
}
@@ -966,6 +975,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
return false;
 }
 
+static void dev_qdisc_reset(struct net_device *dev,
+   struct netdev_queue *dev_queue,
+   void *none)
+{
+   struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+   if (qdisc)
+   qdisc_reset(qdisc);
+}
+
 /**
  * dev_deactivate_many - deactivate transmissions on several devices
  * @head: list of devices to deactivate
@@ -976,7 +995,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate_many(struct list_head *head)
 {
struct net_device *dev;
-   bool sync_needed = false;
 
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -986,20 +1004,27 @@ void dev_deactivate_many(struct list_head *head)
 &noop_qdisc);
 
dev_watchdog_down(dev);
-   sync_needed |= !dev->dismantle;
}
 
/* Wait for outstanding qdisc-less dev_queue_xmit calls.
 * This is avoided if all devices are in dismantle phase :
 * Caller will call synchronize_net() for us
 */
-   if (sync_needed)
-   synchronize_net();
+   synchronize_net();
 
/* 

[RFC PATCH 01/13] net: sched: allow qdiscs to handle locking

2016-08-17 Thread John Fastabend
This patch adds a flag for queueing disciplines to indicate the stack
does not need to use the qdisc lock to protect operations. This can
be used to build lockless scheduling algorithms and improve
performance.

The flag is checked in the tx path and the qdisc lock is only taken
if it is not set. For now use a conditional if statement. Later we
could be more aggressive if it proves worthwhile and use a static key
or wrap this in a likely().

Signed-off-by: John Fastabend 
---
 include/net/pkt_sched.h   |4 +++-
 include/net/sch_generic.h |1 +
 net/core/dev.c|   32 
 net/sched/sch_generic.c   |   26 --
 4 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 7caa99b..69540c6 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -107,8 +107,10 @@ void __qdisc_run(struct Qdisc *q);
 
 static inline void qdisc_run(struct Qdisc *q)
 {
-   if (qdisc_run_begin(q))
+   if (qdisc_run_begin(q)) {
__qdisc_run(q);
+   qdisc_run_end(q);
+   }
 }
 
 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 909aff2..3de6a8c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -58,6 +58,7 @@ struct Qdisc {
 #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy :
  * qdisc_tree_decrease_qlen() should stop.
  */
+#define TCQ_F_NOLOCK   0x80 /* qdisc does not require locking */
u32 limit;
const struct Qdisc_ops  *ops;
struct qdisc_size_table __rcu *stab;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ce07dc..5db395d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3076,6 +3076,26 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
struct Qdisc *q,
int rc;
 
qdisc_calculate_pkt_len(skb, q);
+
+   if (q->flags & TCQ_F_NOLOCK) {
+   if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, >state))) {
+   __qdisc_drop(skb, &to_free);
+   rc = NET_XMIT_DROP;
+   } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) {
+   qdisc_bstats_cpu_update(q, skb);
+   if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
+   __qdisc_run(q);
+   rc = NET_XMIT_SUCCESS;
+   } else {
+   rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+   __qdisc_run(q);
+   }
+
+   if (unlikely(to_free))
+   kfree_skb_list(to_free);
+   return rc;
+   }
+
/*
 * Heuristic to force contended enqueues to serialize on a
 * separate lock before trying to get qdisc main lock.
@@ -3118,6 +3138,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
struct Qdisc *q,
contended = false;
}
__qdisc_run(q);
+   qdisc_run_end(q);
}
}
spin_unlock(root_lock);
@@ -3897,19 +3918,22 @@ static void net_tx_action(struct softirq_action *h)
 
while (head) {
struct Qdisc *q = head;
-   spinlock_t *root_lock;
+   spinlock_t *root_lock = NULL;
 
head = head->next_sched;
 
-   root_lock = qdisc_lock(q);
-   spin_lock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK)) {
+   root_lock = qdisc_lock(q);
+   spin_lock(root_lock);
+   }
/* We need to make sure head->next_sched is read
 * before clearing __QDISC_STATE_SCHED
 */
smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED, >state);
qdisc_run(q);
-   spin_unlock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_unlock(root_lock);
}
}
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e95b67c..af32418 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -170,7 +170,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
int ret = NETDEV_TX_BUSY;
 
/* And release qdisc */
-   spin_unlock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_unlock(root_lock);
 
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)
@@ -183,10 +184,13 @@ int 

[RFC PATCH 04/13] net: sched: provide atomic qlen helpers for bypass case

2016-08-17 Thread John Fastabend
The qlen is used by the core/dev.c to determine if a packet
can skip the qdisc on qdiscs with bypass enabled. In these
cases a per cpu qlen value can cause one cpu to bypass a
qdisc that has packets in it.

To avoid this case use the simplest solution I could come
up with for now and add an atomic qlen value to the qdisc
to use in these cases.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f69da4b..193cf8c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -78,6 +78,7 @@ struct Qdisc {
 */
struct sk_buff  *gso_skb cacheline_aligned_in_smp;
struct sk_buff_head q;
+   atomic_t qlen_atomic;
struct gnet_stats_basic_packed bstats;
seqcount_t  running;
struct gnet_stats_queue qstats;
@@ -247,6 +248,11 @@ static inline void qdisc_cb_private_validate(const struct 
sk_buff *skb, int sz)
BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_atomic(const struct Qdisc *q)
+{
+   return atomic_read(&q->qlen_atomic);
+}
+
 static inline int qdisc_qlen_cpu(const struct Qdisc *q)
 {
return this_cpu_ptr(q->cpu_qstats)->qlen;
@@ -254,8 +260,11 @@ static inline int qdisc_qlen_cpu(const struct Qdisc *q)
 
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
+   /* current default is to use atomic ops for qdisc qlen when
+* running with TCQ_F_NOLOCK.
+*/
if (q->flags & TCQ_F_NOLOCK)
-   return qdisc_qlen_cpu(q);
+   return qdisc_qlen_atomic(q);
 
return q->q.qlen;
 }
@@ -595,6 +604,16 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct 
Qdisc *sch,
q->backlog += qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch)
+{
+   atomic_inc(&sch->qlen_atomic);
+}
+
+static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch)
+{
+   atomic_dec(&sch->qlen_atomic);
+}
+
 static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
 {
this_cpu_ptr(sch->cpu_qstats)->qlen++;



[RFC PATCH 02/13] net: sched: qdisc_qlen for per cpu logic

2016-08-17 Thread John Fastabend
This is a bit interesting because it means sch_direct_xmit will
return a positive value which causes the dequeue/xmit cycle to
continue only when a specific cpu has a qlen > 0.

However, checking each cpu for qlen will break performance, so
it's important to note that qdiscs that set the no lock bit need
to have some sort of per cpu enqueue/dequeue data structure that
maps to the per cpu qlen value.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |8 
 1 file changed, 8 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3de6a8c..354951d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -247,8 +247,16 @@ static inline void qdisc_cb_private_validate(const struct 
sk_buff *skb, int sz)
BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_cpu(const struct Qdisc *q)
+{
+   return this_cpu_ptr(q->cpu_qstats)->qlen;
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
+   if (q->flags & TCQ_F_NOLOCK)
+   return qdisc_qlen_cpu(q);
+
return q->q.qlen;
 }
 



[RFC PATCH 03/13] net: sched: provide per cpu qstat helpers

2016-08-17 Thread John Fastabend
The per cpu qstats support was added with per cpu bstat support which
is currently used by the ingress qdisc. This patch adds the set of
helpers needed to let other qdiscs use qstats per cpu as well.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   39 +++
 1 file changed, 39 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 354951d..f69da4b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -573,12 +573,43 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc 
*sch,
sch->qstats.backlog -= qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
+   const struct sk_buff *skb)
+{
+   struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats);
+
+   q->backlog -= qdisc_pkt_len(skb);
+}
+
 static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
const struct sk_buff *skb)
 {
sch->qstats.backlog += qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
+   const struct sk_buff *skb)
+{
+   struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats);
+
+   q->backlog += qdisc_pkt_len(skb);
+}
+
+static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->qlen++;
+}
+
+static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->qlen--;
+}
+
+static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->requeues++;
+}
+
 static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
 {
sch->qstats.drops += count;
@@ -751,6 +782,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, 
struct Qdisc *sch)
qdisc_qstats_drop(sch);
 }
 
+static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
+struct sk_buff **to_free)
+{
+   __qdisc_drop(skb, to_free);
+   qdisc_qstats_cpu_drop(sch);
+
+   return NET_XMIT_DROP;
+}
 
 static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
 struct sk_buff **to_free)



Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-17 Thread Timur Tabi

Timur Tabi wrote:


The question is: how should adpt->phy.version be initialized on device tree
and ACPI platforms?

The reason why this is confusing is because there are conflicting
perspectives of this "internal PHY".  It both is and is not part of the
EMAC.  It is a separate block in the chip, and can be replaced with
other SGMII blocks, and in the future there will be a "v3" and maybe a
"v4" or who knows what.  In fact, maybe using version numbers is
inappropriate and we'll need to use vendor names or something.

The problem is that the internal PHY has no mechanism for
self-identification.  There is no ACPI or device tree node for it. There
is no register you can query to see if it's there or what version it
is.  The MAC part of the EMAC has no knowledge of the PHY part.  The
driver just has to know it's there.

This is why I'm advocating a separate property (DT and ACPI) to identify
the internal PHY.


So we had some internal discussion on this.  Different compatible 
properties for DT are okay, and for ACPI we'll either create a 
"phy-version" property (DT-like properties are called DSDs in ACPI) or 
use an HRV (a specific property for the hardware version).
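
For illustration, reading it would then be uniform across DT and ACPI
via the unified device property API (a sketch only; "phy-version" is
just the proposed name, and the default below is arbitrary):

    /* device_property_read_u32() reads a DT property or an ACPI
     * _DSD entry through the same call.
     */
    u32 phy_version;

    if (device_property_read_u32(&pdev->dev, "phy-version",
                                 &phy_version))
        phy_version = 1;    /* assume v1 when the property is absent */
    adpt->phy.version = phy_version;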


But this speaks to the greater problem of mapping DT compatible strings 
to ACPI.  Like I said, the ACPI equivalent of a compatible string is the 
HID.  But unlike DT, we cannot create new HIDs whenever we feel like it. 
 The process for generating and assigning HIDs is much more involved, 
and we do have a very limited namespace.


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: [RFC PATCH 4/5] net: filter: run cgroup eBPF programs

2016-08-17 Thread Alexei Starovoitov
On Wed, Aug 17, 2016 at 11:20:29AM -0700, Alexei Starovoitov wrote:
> On Wed, Aug 17, 2016 at 04:00:47PM +0200, Daniel Mack wrote:
> > If CONFIG_CGROUP_BPF is enabled, and the cgroup associated with the
> > receiving socket has an eBPF programs installed, run them from
> > sk_filter_trim_cap().
> > 
> > eBPF programs used in this context are expected to either return 1 to
> > let the packet pass, or != 1 to drop them. The programs have access to
> > the full skb, including the MAC headers.
> > 
> > This patch only implements the call site for ingress packets.
> > 
> > Signed-off-by: Daniel Mack 
> > ---
> >  net/core/filter.c | 44 
> >  1 file changed, 44 insertions(+)
> > 
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index c5d8332..a1dd94b 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -52,6 +52,44 @@
> >  #include 
> >  #include 
> >  
> > +#ifdef CONFIG_CGROUP_BPF
> > +static int sk_filter_cgroup_bpf(struct sock *sk, struct sk_buff *skb,
> > +   enum bpf_attach_type type)
> > +{
> > +   struct sock_cgroup_data *skcd = &sk->sk_cgrp_data;
> > +   struct cgroup *cgrp = sock_cgroup_ptr(skcd);
> > +   struct bpf_prog *prog;
> > +   int ret = 0;
> > +
> > +   rcu_read_lock();
> > +
> > +   switch (type) {
> > +   case BPF_ATTACH_TYPE_CGROUP_EGRESS:
> > +   prog = rcu_dereference(cgrp->bpf_egress);
> > +   break;
> > +   case BPF_ATTACH_TYPE_CGROUP_INGRESS:
> > +   prog = rcu_dereference(cgrp->bpf_ingress);
> > +   break;
> > +   default:
> > +   WARN_ON_ONCE(1);
> > +   ret = -EINVAL;
> > +   break;
> > +   }
> > +
> > +   if (prog) {
> 
> I really like how in this version of the patches it became
> a single load+cmp of per-packet cost when this feature is off.
> Please move
> + struct cgroup *cgrp = sock_cgroup_ptr(skcd);
> into if (prog) {..}
> to make sure it's actually single load.
> The compiler cannot avoid that load when it's placed at the top.

sorry. brain fart. it is two loads. scratch that.

> > +   unsigned int offset = skb->data - skb_mac_header(skb);
> > +
> > +   __skb_push(skb, offset);
> > +   ret = bpf_prog_run_clear_cb(prog, skb) > 0 ? 0 : -EPERM;
> 
> that doesn't match commit log. The above '> 0' makes sense to me though.
> If we want to do it for 1 only we have to define it in uapi/bpf.h
> as action code, so we can extend to 2, 3 in the future if necessary.
> 
> It also have to be bpf_prog_run_save_cb() (as sk_filter_trim_cap() does)
> instead of bpf_prog_run_clear_cb().
> See commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs")
> 
> > +   __skb_pull(skb, offset);
> > +   }
> > +
> > +   rcu_read_unlock();
> > +
> > +   return ret;
> > +}
> > +#endif /* !CONFIG_CGROUP_BPF */
> > +
> >  /**
> >   * sk_filter_trim_cap - run a packet through a socket filter
> >   * @sk: sock associated with &sk_buff
> > @@ -78,6 +116,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff 
> > *skb, unsigned int cap)
> > if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
> > return -ENOMEM;
> >  
> > +#ifdef CONFIG_CGROUP_BPF
> > +   err = sk_filter_cgroup_bpf(sk, skb, BPF_ATTACH_TYPE_CGROUP_INGRESS);
> > +   if (err)
> > +   return err;
> > +#endif
> > +
> > err = security_sock_rcv_skb(sk, skb);
> > if (err)
> > return err;
> > -- 
> > 2.5.5
> > 


Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

2016-08-17 Thread Alexei Starovoitov
On Wed, Aug 17, 2016 at 09:16:02AM -0700, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 16:00 +0200, Daniel Mack wrote:
> 
> > +   progp = is_ingress ? &cgrp->bpf_ingress : &cgrp->bpf_egress;
> > +
> > +   rcu_read_lock();
> > +   old_prog = rcu_dereference(*progp);
> > +   rcu_assign_pointer(*progp, prog);
> > +
> > +   if (old_prog)
> > +   bpf_prog_put(old_prog);
> > +
> > +   rcu_read_unlock();
> 
> 
> This is a bogus locking strategy.

yep. this rcu_lock/unlock won't solve the race between parallel
bpf_prog_attach calls.
please use xchg() similar to bpf_fd_array_map_update_elem()
Then prog pointer will be swapped atomically and bpf_prog_put()
will free it via call_rcu.
The reader side in sk_filter_cgroup_bpf() looks correct.
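
i.e. roughly (sketch):

    /* publish the new prog atomically; bpf_prog_put() releases the
     * old one via call_rcu, so readers under rcu_read_lock() see
     * either the old or the new program, never a freed one.
     */
    old_prog = xchg(progp, prog);
    if (old_prog)
        bpf_prog_put(old_prog);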
 


Re: [RFC PATCH 4/5] net: filter: run cgroup eBPF programs

2016-08-17 Thread Alexei Starovoitov
On Wed, Aug 17, 2016 at 04:00:47PM +0200, Daniel Mack wrote:
> If CONFIG_CGROUP_BPF is enabled, and the cgroup associated with the
> receiving socket has an eBPF programs installed, run them from
> sk_filter_trim_cap().
> 
> eBPF programs used in this context are expected to either return 1 to
> let the packet pass, or != 1 to drop them. The programs have access to
> the full skb, including the MAC headers.
> 
> This patch only implements the call site for ingress packets.
> 
> Signed-off-by: Daniel Mack 
> ---
>  net/core/filter.c | 44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c5d8332..a1dd94b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -52,6 +52,44 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_CGROUP_BPF
> +static int sk_filter_cgroup_bpf(struct sock *sk, struct sk_buff *skb,
> + enum bpf_attach_type type)
> +{
> + struct sock_cgroup_data *skcd = &sk->sk_cgrp_data;
> + struct cgroup *cgrp = sock_cgroup_ptr(skcd);
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + rcu_read_lock();
> +
> + switch (type) {
> + case BPF_ATTACH_TYPE_CGROUP_EGRESS:
> + prog = rcu_dereference(cgrp->bpf_egress);
> + break;
> + case BPF_ATTACH_TYPE_CGROUP_INGRESS:
> + prog = rcu_dereference(cgrp->bpf_ingress);
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + ret = -EINVAL;
> + break;
> + }
> +
> + if (prog) {

I really like how in this version of the patches it became
a single load+cmp of per-packet cost when this feature is off.
Please move
+   struct cgroup *cgrp = sock_cgroup_ptr(skcd);
into if (prog) {..}
to make sure it's actually single load.
The compiler cannot avoid that load when it's placed at the top.

> + unsigned int offset = skb->data - skb_mac_header(skb);
> +
> + __skb_push(skb, offset);
> + ret = bpf_prog_run_clear_cb(prog, skb) > 0 ? 0 : -EPERM;

that doesn't match commit log. The above '> 0' makes sense to me though.
If we want to do it for 1 only we have to define it in uapi/bpf.h
as action code, so we can extend to 2, 3 in the future if necessary.

It also has to be bpf_prog_run_save_cb() (as sk_filter_trim_cap() does)
instead of bpf_prog_run_clear_cb().
See commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs")
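
i.e. the call site becomes, roughly (sketch):

    /* preserve skb->cb around the program run, as sk_filter does */
    ret = bpf_prog_run_save_cb(prog, skb) > 0 ? 0 : -EPERM;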

> + __skb_pull(skb, offset);
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +#endif /* !CONFIG_CGROUP_BPF */
> +
>  /**
>   *   sk_filter_trim_cap - run a packet through a socket filter
>   *   @sk: sock associated with &sk_buff
> @@ -78,6 +116,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff 
> *skb, unsigned int cap)
>   if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
>   return -ENOMEM;
>  
> +#ifdef CONFIG_CGROUP_BPF
> + err = sk_filter_cgroup_bpf(sk, skb, BPF_ATTACH_TYPE_CGROUP_INGRESS);
> + if (err)
> + return err;
> +#endif
> +
>   err = security_sock_rcv_skb(sk, skb);
>   if (err)
>   return err;
> -- 
> 2.5.5
> 


Re: [RFC PATCH 2/5] cgroup: add bpf_{e,in}gress pointers

2016-08-17 Thread Alexei Starovoitov
On Wed, Aug 17, 2016 at 04:00:45PM +0200, Daniel Mack wrote:
> Add two pointers for eBPF programs to struct cgroup. These will be used
> to store programs for ingress and egress for accounting and filtering.
> 
> This new feature is guarded by CONFIG_CGROUP_BPF.
...
> +#ifdef CONFIG_CGROUP_BPF
> + /* used by the networking layer */
> + struct bpf_prog *bpf_ingress;
> + struct bpf_prog *bpf_egress;
> +#endif
...
> +config CGROUP_BPF
> + bool "Enable eBPF programs in cgroups"
> + depends on BPF_SYSCALL
> + help
> +   This options allows cgroups to accommodate eBPF programs that
> +   can be used for network traffic filtering and accounting. See
> +   Documentation/networking/filter.txt for more information.
> +

I think this extra config is unnecessary. It makes the code harder to follow.
Anyone turning on bpf syscall and cgroups should be able to have this feature.
Extra config is imo overkill.



Re: [RFC PATCH 2/5] cgroup: add bpf_{e,in}gress pointers

2016-08-17 Thread Tejun Heo
Hello,

On Wed, Aug 17, 2016 at 10:50:40AM -0700, Alexei Starovoitov wrote:
> > +config CGROUP_BPF
> > +   bool "Enable eBPF programs in cgroups"
> > +   depends on BPF_SYSCALL
> > +   help
> > + This options allows cgroups to accommodate eBPF programs that
> > + can be used for network traffic filtering and accounting. See
> > + Documentation/networking/filter.txt for more information.
> > +
> 
> I think this extra config is unnecessary. It makes the code harder to follow.
> Anyone turning on bpf syscall and cgroups should be able to have this feature.
> Extra config is imo overkill.

Agreed, the added code is pretty small, especially in comparison to
both cgroup and bpf, and the only memory overhead would be four
pointers per cgroup, which shouldn't be noticeable at all.

Thanks.

-- 
tejun


[PATCH] net: ethernet: nuvoton: fix spelling mistake: "aligment" -> "alignment"

2016-08-17 Thread Colin King
From: Colin Ian King 

trivial fix to spelling mistake in dev_err message

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/nuvoton/w90p910_ether.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c 
b/drivers/net/ethernet/nuvoton/w90p910_ether.c
index 87b7b81..712d8bc 100644
--- a/drivers/net/ethernet/nuvoton/w90p910_ether.c
+++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c
@@ -751,7 +751,7 @@ static void netdev_rx(struct net_device *dev)
dev_err(>dev, "rx crc err\n");
ether->stats.rx_crc_errors++;
} else if (status & RXDS_ALIE) {
-   dev_err(>dev, "rx aligment err\n");
+   dev_err(>dev, "rx alignment err\n");
ether->stats.rx_frame_errors++;
} else if (status & RXDS_PTLE) {
dev_err(>dev, "rx longer err\n");
-- 
2.9.3



Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

2016-08-17 Thread Alexei Starovoitov
On Wed, Aug 17, 2016 at 05:51:44PM +0200, Daniel Mack wrote:
> On 08/17/2016 05:06 PM, Tejun Heo wrote:
> > What I'd suggest is keeping two sets of pointers, one set for the
> > programs explicitly attached to the cgroup and the other for programs
> > which are to be executed for the cgroup.  The two pointers in the
> > latter set will point to the closest non-null program in its ancestry.
> > Note that the pointers may point to programs belonging to different
> > ancestors.
> > 
> > A - B - C
> >   \ D - E
> > 
> > Let's say B has both ingress and egress programs and D only has
> > egress.  E's execution pointers should point to D's egress program and
> > B's ingress program.
> > 
> > The pointers would need to be updated
> > 
> > 1. When a program is installed or removed.  For simplicity's sake, we
> >can just walk the whole hierarchy.  If that's too expensive
> >(shouldn't be), we can find out the closest ancestor where both
> >programs can be determined and then walk from there.
> > 
> > 2. When a new cgroup is created, it should inherit its execution table
> >from its parent.
> 
> Ok, yes, that sounds sane to me. Will implement that scheme.

That makes sense to me as well. Sounds quite flexible and fast
with only one check per packet per direction.
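A minimal sketch of the scheme Tejun describes, under illustrative names (the struct layout and helpers are assumptions, not the eventual implementation):

struct bpf_prog;

enum { BPF_DIR_INGRESS, BPF_DIR_EGRESS, BPF_DIR_MAX };

struct cgroup_bpf {
	/* programs explicitly attached to this cgroup (may be NULL) */
	struct bpf_prog *attached[BPF_DIR_MAX];
	/* programs to execute: closest non-NULL program in the ancestry */
	struct bpf_prog *effective[BPF_DIR_MAX];
};

/* On attach/detach: recompute one cgroup; the caller then walks the
 * subtree (css_for_each_descendant_pre() in a real implementation). */
static void cgroup_bpf_update(struct cgroup_bpf *c,
			      struct cgroup_bpf *parent, int dir)
{
	c->effective[dir] = c->attached[dir] ?:
			    (parent ? parent->effective[dir] : NULL);
}

/* On cgroup creation: inherit the parent's execution table. */
static void cgroup_bpf_inherit(struct cgroup_bpf *c,
			       struct cgroup_bpf *parent)
{
	int dir;

	for (dir = 0; dir < BPF_DIR_MAX; dir++)
		c->effective[dir] = parent->effective[dir];
}

In Tejun's example, E's effective egress pointer resolves to D's program and its effective ingress pointer to B's, so the per-packet check remains a single pointer load per direction.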



Re: [PATCH v2 1/1] ppp: Fix one deadlock issue of PPP when send frame

2016-08-17 Thread Guillaume Nault
On Tue, Aug 16, 2016 at 07:33:38PM +0800, f...@ikuai8.com wrote:
> From: Gao Feng 
> 
> The PPP channel holds a spinlock before sending a frame. But with a
> wrong route policy, the skb may be routed back to the same PPP
> channel. As a result, the skb re-enters the same channel path and
> tries to take the spinlock that is already held. Bang, the deadlock
> comes out.
> 
Unless I misunderstood the problem you're trying to solve, this patch
doesn't really help: deadlock still occurs if the same IP is used for
L2TP and PPP's peer address.

> Now add a lock owner to avoid this, like the xmit_lock_owner of
> netdev_queue. Check the lock owner before trying to take the
> spinlock. If the current CPU is already the owner, there is no need
> to lock again. When the PPP channel takes the spinlock for the first
> time, it records the current CPU ID as the owner.
>
I think you should forbid lock recursion entirely, and drop the packet
if the owner tries to re-acquire the channel lock. Otherwise you just
move the deadlock down the stack (l2tp_xmit_skb() can't be called
recursively).

> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
> index 70cfa06..6909ab1 100644
> --- a/drivers/net/ppp/ppp_generic.c
> +++ b/drivers/net/ppp/ppp_generic.c
> @@ -162,6 +162,37 @@ struct ppp {
>|SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
>|SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
>  
> +struct chennel_lock {
^
I guess you meant 'channel_lock'.

> + spinlock_t lock;
> + u32 owner;
This field's default value is -1. So why not declare it as 'int'?
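For illustration, a minimal sketch of the drop-on-recursion approach suggested above. The struct follows the patch under review (with the typo fixed); the helper names and exact shape are assumptions:

struct channel_lock {
	spinlock_t lock;
	int owner;		/* CPU id of the lock holder, or -1 */
};

/* Assumes callers run in the xmit path with BHs disabled, so
 * smp_processor_id() is stable here. */
static int chan_lock_or_drop(struct channel_lock *cl, struct sk_buff *skb)
{
	if (cl->owner == smp_processor_id()) {
		/* Re-entered our own transmit path (e.g. via a misrouted
		 * tunnel): drop instead of deadlocking or recursing. */
		kfree_skb(skb);
		return -EDEADLK;
	}
	spin_lock_bh(&cl->lock);
	cl->owner = smp_processor_id();
	return 0;
}

static void chan_unlock(struct channel_lock *cl)
{
	cl->owner = -1;
	spin_unlock_bh(&cl->lock);
}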


Re: regression in xgene-enet in 4.8-rc series, oops from xgene_enet_probe

2016-08-17 Thread Loc Ho
Hi Riku,

>> When the driver is configured as kernel module and when it gets
>> unloaded and reloaded, kernel crash was observed.  This patch
>> addresses the software cleanup by doing the following,
>>
>> - Moved register_netdev call after hardware is ready
>> - Since ndev is not ready, added set_irq_name to set irq name
>> - Since ndev is not ready, changed mdio_bus->parent to pdev->dev
>> - Replaced netif_start(stop)_queue by netif_tx_start(stop)_queues
>> - Removed napi_del call since it's called by free_netdev
>> - Added dev_close call, within remove
>> - Added shutdown callback
>> - Changed to use dmam_ APIs
>
> Bisecting points to this patch, committed as
> cb0366b7c16427a25923350b69f53a5b1345a34b, as the cause of the oops
> when booting apm mustang:
>
> [1.670201] [ cut here ]
> [1.674804] WARNING: CPU: 2 PID: 1 at ../net/core/dev.c:6696
> rollback_registered_many+0x60/0x300
> [1.683543] Modules linked in: realtek
> [1.687291]
> [1.688774] CPU: 2 PID: 1 Comm: swapper/0 Not tainted
> 4.8.0-rc2-00037-g3ec60b92d3ba #1
> [1.696648] Hardware name: APM X-Gene Mustang board (DT)
> [1.701930] task: 8003ee078000 task.stack: 8003ee054000
> [1.707819] PC is at rollback_registered_many+0x60/0x300
> [1.713102] LR is at rollback_registered_many+0x30/0x300
> [1.718384] pc : [] lr : [] pstate: 2045
> [1.725739] sp : 8003ee057b00
> [1.729034] x29: 8003ee057b00 x28: 8003eda1a000
> [1.734338] x27: 0002 x26: 8003ebcba970
> [1.739641] x25: 8003eda1a208 x24: 8003eda1a010
> [1.744945] x23: 8003ee057c58 x22: 8003ebcba000
> [1.750247] x21: ffed x20: 8003ee057b70
> [1.755549] x19: 8003ee057b50 x18: 08dfafff
> [1.760852] x17: 0007 x16: 0001
> [1.766154] x15: 08ce2000 x14: 
> [1.771458] x13: 0008 x12: 0030
> [1.776760] x11: 0030 x10: 0101010101010101
> [1.782062] x9 :  x8 : 8003df80c700
> [1.787365] x7 :  x6 : 0001
> [1.792668] x5 : dead0100 x4 : dead0200
> [1.797971] x3 : 8003ebcba070 x2 : 
> [1.803273] x1 : 8003ee057b00 x0 : 8003ebcba000
> [1.808575]
> [1.810057] ---[ end trace 93f1dda704e63533 ]---
> [1.814648] Call trace:
> [1.816207] ata2: SATA link down (SStatus 0 SControl 4300)
> [1.822535] Exception stack(0x8003ee057930 to 0x8003ee057a60)
> [1.828941] 7920:
> 8003ee057b50 0001
> [1.836729] 7940: 8003ee057b00 08773c18
> 8003ee057980 08849a1c
> [1.844517] 7960: 0009 08e5
> 8003ee0579a0 086eb03c
> [1.852305] 7980: 08dbcde8 8003fffe1ca0
> 0040 8003ee057998
> [1.860094] 79a0: 8003ee0579e0 086eb1b0
> 0004 8003ee057a4c
> f8003ebcba000 8003ee057b00
> [1.875669] 79e0:  8003ebcba070
> dead0200 dead0100
> [1.883457] 7a00: 0001 
> 8003df80c700 
> [1.884198] ata4: SATA link down (SStatus 0 SControl 4300)
> [1.884211] ata3: SATA link down (SStatus 0 SControl 4300)
> [1.902153] 7a20: 0101010101010101 0030
> 0030 0008
> [1.909941] 7a40:  08ce2000
> 0001 0007
> [1.917730] [] rollback_registered_many+0x60/0x300
> [1.924050] [] rollback_registered+0x28/0x40
> [1.929852] [] unregister_netdevice_queue+0x78/0xb8
> [1.936259] [] unregister_netdev+0x20/0x30
> [1.941889] [] xgene_enet_probe+0x638/0xf98
> [1.947605] [] platform_drv_probe+0x50/0xb8
> [1.953320] [] driver_probe_device+0x204/0x2b0
> [1.959294] [] __driver_attach+0xac/0xb0
> [1.964751] [] bus_for_each_dev+0x60/0xa0
> [1.970293] [] driver_attach+0x20/0x28
> [1.975576] [] bus_add_driver+0x1d0/0x238
> [1.981118] [] driver_register+0x60/0xf8
> [1.986574] [] __platform_driver_register+0x40/0x48
> [1.992982] [] xgene_enet_driver_init+0x18/0x20
> [1.999044] [] do_one_initcall+0x38/0x128
> [2.004588] [] kernel_init_freeable+0x1ac/0x250
> [2.010651] [] kernel_init+0x10/0x100
> [2.015847] [] ret_from_fork+0x10/0x40
> [2.021152] network todo 'eth%d' but state 0
>
> Picked up from:
>
> https://storage.kernelci.org/mainline/v4.8-rc2-37-g3ec60b92d3ba/arm64-defconfig/lab-cambridge/boot-apm-mustang.html
>
> Visible on all mainline/apm-mustang boot reports. net-next seems to
> have a fix for this.

Iyappan will follow up if required. But I notice that you are using
the 1.13.xx boot loader. The latest officially released boot loader
version is 1.15.23 (from late 2015). I would recommend upgrading if
possible.

-Loc


Server fails to boot with net-next

2016-08-17 Thread Yuval Mintz
Just updated to the latest net-next and the boot crashed in x2apic_cluster_probe().
Bisection seemed to point to:

commit 6b2c28471de550308784560206c3365e5179d42f
Author: Sebastian Andrzej Siewior 
Date:   Wed Jul 13 17:17:00 2016 +

x86/x2apic: Convert to CPU hotplug state machine

After recompiling with CONFIG_X86_X2APIC removed, the server booted
properly.

Are there known issues here?




Re: Server fails to boot with net-next

2016-08-17 Thread bige...@linutronix.de
On 2016-08-17 16:58:02 [+], Yuval Mintz wrote:
> Are there known issues here?
fixed in -rc2 by 
http://git.kernel.org/tip/d52c0569bab4edc32df44dc7ac28517134f6

Sebastian


[PATCH net] netfilter: tproxy: properly refcount tcp listeners

2016-08-17 Thread Eric Dumazet
From: Eric Dumazet 

inet_lookup_listener() and inet6_lookup_listener() no longer
take a reference on the found listener.

This minimal patch adds back the refcounting, but we might do
this differently in net-next later.

Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
Reported-and-tested-by: Denys Fedoryshchenko 
Signed-off-by: Eric Dumazet 
---
Note: bug added in 4.7, stable candidate.

 net/netfilter/xt_TPROXY.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 7f4414d26a66..663c4c3c9072 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 						    daddr, dport,
 						    in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound
@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 						   daddr, ntohs(dport),
 						   in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound




Re: kernel panic TPROXY , vanilla 4.7.1

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 19:44 +0300, Denys Fedoryshchenko wrote:

> Yes, everything fine after patch!
> Thanks a lot

Perfect, thanks for testing, I am sending the official patch.




Re: kernel panic TPROXY , vanilla 4.7.1

2016-08-17 Thread Denys Fedoryshchenko

On 2016-08-17 19:04, Eric Dumazet wrote:

On Wed, 2016-08-17 at 08:42 -0700, Eric Dumazet wrote:

On Wed, 2016-08-17 at 17:31 +0300, Denys Fedoryshchenko wrote:
> Hi!
>
> Tried to run squid on latest kernel, and hit a panic
> Sometimes it just shows warning in dmesg (but doesnt work properly)
> [   75.701666] IPv4: Attempt to release TCP socket in state 10
> 88102d430780
> [   83.866974] squid (2700) used greatest stack depth: 12912 bytes left
> [   87.506644] IPv4: Attempt to release TCP socket in state 10
> 880078a48780
> [  114.704295] IPv4: Attempt to release TCP socket in state 10
> 881029f8ad00
>
> I cannot catch yet oops/panic message, netconsole not working.
>
> After triggering warning message 3 times, i am unable to run squid
> anymore (without reboot), and in netstat it doesnt show port running.
>
> firewall is:
> *mangle
> -A PREROUTING -p tcp -m socket -j DIVERT
> -A PREROUTING -p tcp -m tcp --dport 80 -i eno1 -j TPROXY --on-port 3129
> --on-ip 0.0.0.0 --tproxy-mark 0x1/0x1
> -A DIVERT -j MARK --set-xmark 0x1/0x
> -A DIVERT -j ACCEPT
>
> routing
> ip rule add fwmark 1 lookup 100
> ip route add local default dev eno1 table 100
>
>
> squid config is default with tproxy option
> http_port 3129 tproxy
>

Hmppff... sorry for this, I will send a fix.

Thanks for the report !




Could you try the following ?

Thanks !

 net/netfilter/xt_TPROXY.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 7f4414d26a66..663c4c3c9072 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 						    daddr, dport,
 						    in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound
@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 						   daddr, ntohs(dport),
 						   in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound

Yes, everything fine after patch!
Thanks a lot


Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD

2016-08-17 Thread Mike Manning
On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote:
> On 17.08.2016 12:28, Mike Manning wrote:
>> +static void dev_disable_change(struct inet6_dev *idev);
>>  
>>  /*
>>   *  Configured unicast address hash table
>> @@ -1945,6 +1946,12 @@ lock_errdad:
>>  
>>  			pr_info("%s: IPv6 being disabled!\n",
>>  				ifp->idev->dev->name);
>> +			spin_unlock_bh(&ifp->lock);
>> +			addrconf_dad_stop(ifp, 1);
>> +			rtnl_lock();
>> +			dev_disable_change(idev);
>> +			rtnl_unlock();
>> +			return;
>>  		}
>>  	}
> 
> You can't take rtnl_lock at that point but must postpone the actions and
> do that in addrconf_dad_work.
> 
> Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs
> to move there.
> 
> Bye,
> Hannes
> 
> 

Thanks for the prompt review, I will look into making these changes.

Also these changes caused a build error due to conditional compilation 
without CONFIG_SYSCTL; this is resolved by replacing the call to 
dev_disable_change(idev) with a direct call to 
addrconf_ifdown(idev->dev, 0).

I would appreciate any further comments if the suggested change in behavior is 
not acceptable.

Thanks
Mike



Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD

2016-08-17 Thread Hannes Frederic Sowa
On Wed, Aug 17, 2016, at 18:08, Mike Manning wrote:
> On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote:
> > On 17.08.2016 12:28, Mike Manning wrote:
> >> +static void dev_disable_change(struct inet6_dev *idev);
> >>  
> >>  /*
> >>   *Configured unicast address hash table
> >> @@ -1945,6 +1946,12 @@ lock_errdad:
> >>  
> >>  			pr_info("%s: IPv6 being disabled!\n",
> >>  				ifp->idev->dev->name);
> >> +			spin_unlock_bh(&ifp->lock);
> >> +			addrconf_dad_stop(ifp, 1);
> >> +			rtnl_lock();
> >> +			dev_disable_change(idev);
> >> +			rtnl_unlock();
> >> +			return;
> >>  		}
> >>  	}
> > 
> > You can't take rtnl_lock at that point but must postpone the actions and
> > do that in addrconf_dad_work.
> > 
> > Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs
> > to move there.
> > 
> > Bye,
> > Hannes
> > 
> > 
> 
> Thanks for the prompt review, I will look into making these changes.
> 
> Also these changes caused a build error due to conditional compilation
> without CONFIG_SYSCTL; this is resolved by replacing the call to
> dev_disable_change(idev) with a direct call to
> addrconf_ifdown(idev->dev, 0).
> 
> I would appreciate any further comments if the suggested change in
> behavior is not acceptable.

What you describe in the changelog as the current behavior looks like
a bug to me, so your patch makes sense to me.

Bye,
Hannes


Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 16:00 +0200, Daniel Mack wrote:

> +	progp = is_ingress ? &cgrp->bpf_ingress : &cgrp->bpf_egress;
> +
> +	rcu_read_lock();
> +	old_prog = rcu_dereference(*progp);
> +	rcu_assign_pointer(*progp, prog);
> +
> +	if (old_prog)
> +		bpf_prog_put(old_prog);
> +
> +	rcu_read_unlock();


This is a bogus locking strategy.

You do not want to use rcu_read_lock()/rcu_read_unlock() here, but
appropriate writer exclusion (a mutex probably, or a spinlock).

Then use rcu_dereference_protected() instead of rcu_dereference(*progp);
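
A sketch of the locking Eric is asking for, assuming a dedicated mutex serializing attach/detach (the mutex and function name are assumptions, not code from the RFC):

static DEFINE_MUTEX(cgroup_bpf_mutex);	/* assumed writer-side lock */

static void cgroup_bpf_replace(struct bpf_prog __rcu **progp,
			       struct bpf_prog *prog)
{
	struct bpf_prog *old_prog;

	mutex_lock(&cgroup_bpf_mutex);
	old_prog = rcu_dereference_protected(*progp,
					lockdep_is_held(&cgroup_bpf_mutex));
	rcu_assign_pointer(*progp, prog);
	mutex_unlock(&cgroup_bpf_mutex);

	/* Readers run under rcu_read_lock(); a real implementation must
	 * ensure the old program is not freed under them, e.g. by an
	 * RCU-deferred release in the final put. */
	if (old_prog)
		bpf_prog_put(old_prog);
}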




Re: [PATCH net] tcp: fix use after free in tcp_xmit_retransmit_queue()

2016-08-17 Thread Cong Wang
On Wed, Aug 17, 2016 at 5:56 AM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the
> tail of the write queue using tcp_add_write_queue_tail()
>
> Then it attempts to copy user data into this fresh skb.
>
> If the copy fails, we undo the work and remove the fresh skb.
>
> Unfortunately, this undo lacks the change done to tp->highest_sack and
> we can leave a dangling pointer (to a freed skb)
>
> Later, tcp_xmit_retransmit_queue() can dereference this pointer and
> access freed memory. For regular kernels where memory is not unmapped,
> this might cause SACK bugs because tcp_highest_sack_seq() is buggy,
> returning garbage instead of tp->snd_nxt, but with various debug
> features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel.
>
> This bug was found by Marco Grassi thanks to syzkaller.
>
> Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb")
> Reported-by: Marco Grassi 
> Signed-off-by: Eric Dumazet 
> Cc: Ilpo Järvinen 
> Cc: Yuchung Cheng 
> Cc: Neal Cardwell 
> ---
>  include/net/tcp.h |2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index c00e7d51bb18..7717302cab91 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1523,6 +1523,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
>  {
>  	if (sk->sk_send_head == skb_unlinked)
>  		sk->sk_send_head = NULL;
> +	if (tcp_sk(sk)->highest_sack == skb_unlinked)
> +		tcp_sk(sk)->highest_sack = NULL;
>  }
>

Nit: the function name probably needs to change too, since it now
checks more than just send_head. ;) But we can always do this
for net-next, don't let this be a blocker for this security fix.

Reviewed-by: Cong Wang 

Thanks!
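
For reference, the (abridged) helper where the unlink happens: tcp_sendmsg()'s error path removes the fresh skb through tcp_unlink_write_queue() in include/net/tcp.h, which is why clearing the pointer inside tcp_check_send_head() covers the failing-copy case:

static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_check_send_head(sk, skb);	/* now also clears highest_sack */
}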


Re: kernel panic TPROXY , vanilla 4.7.1

2016-08-17 Thread Eric Dumazet
On Wed, 2016-08-17 at 08:42 -0700, Eric Dumazet wrote:
> On Wed, 2016-08-17 at 17:31 +0300, Denys Fedoryshchenko wrote:
> > Hi!
> > 
> > Tried to run squid on latest kernel, and hit a panic
> > Sometimes it just shows warning in dmesg (but doesnt work properly)
> > [   75.701666] IPv4: Attempt to release TCP socket in state 10 
> > 88102d430780
> > [   83.866974] squid (2700) used greatest stack depth: 12912 bytes left
> > [   87.506644] IPv4: Attempt to release TCP socket in state 10 
> > 880078a48780
> > [  114.704295] IPv4: Attempt to release TCP socket in state 10 
> > 881029f8ad00
> > 
> > I cannot catch yet oops/panic message, netconsole not working.
> > 
> > After triggering warning message 3 times, i am unable to run squid 
> > anymore (without reboot), and in netstat it doesnt show port running.
> > 
> > firewall is:
> > *mangle
> > -A PREROUTING -p tcp -m socket -j DIVERT
> > -A PREROUTING -p tcp -m tcp --dport 80 -i eno1 -j TPROXY --on-port 3129 
> > --on-ip 0.0.0.0 --tproxy-mark 0x1/0x1
> > -A DIVERT -j MARK --set-xmark 0x1/0x
> > -A DIVERT -j ACCEPT
> > 
> > routing
> > ip rule add fwmark 1 lookup 100
> > ip route add local default dev eno1 table 100
> > 
> > 
> > squid config is default with tproxy option
> > http_port 3129 tproxy
> > 
> 
> Hmppff... sorry for this, I will send a fix.
> 
> Thanks for the report !
> 


Could you try the following ?

Thanks !

 net/netfilter/xt_TPROXY.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 7f4414d26a66..663c4c3c9072 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 						    daddr, dport,
 						    in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound
@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 						   daddr, ntohs(dport),
 						   in->ifindex);
 
+			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
 			 * xt_socket, since xt_TPROXY needs 0 bound




Re: [RFC PATCH 3/5] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

2016-08-17 Thread Daniel Mack
On 08/17/2016 05:06 PM, Tejun Heo wrote:
> What I'd suggest is keeping two sets of pointers, one set for the
> programs explicitly attached to the cgroup and the other for programs
> which are to be executed for the cgroup.  The two pointers in the
> latter set will point to the closest non-null program in its ancestry.
> Note that the pointers may point to programs belonging to different
> ancestors.
> 
>   A - B - C
>   \ D - E
> 
> Let's say B has both ingress and egress programs and D only has
> egress.  E's execution pointers should point to D's egress program and
> B's ingress program.
> 
> The pointers would need to be updated
> 
> 1. When a program is installed or removed.  For simplicity's sake, we
>can just walk the whole hierarchy.  If that's too expensive
>(shouldn't be), we can find out the closest ancestor where both
>programs can be determined and then walk from there.
> 
> 2. When a new cgroup is created, it should inherit its execution table
>from its parent.

Ok, yes, that sounds sane to me. Will implement that scheme.


Thanks!
Daniel


