Re: SKB Reference Question

2017-11-01 Thread David Miller
From: Joe Smith 
Date: Wed, 1 Nov 2017 10:27:49 -0700

> How strictly are references on the SKB enforced? For example,
> tcp_transmit_skb() clones the SKB and adds a TCP header. Can I assume
> that in case of re-transmission the header added will be there and can
> be reused instead of creating a new one from scratch. Some fields like
> time stamp would need to be updated but they should be unmodified.

Every time a data packet goes out, whether it is the initial transmission
or a retransmission, it goes through tcp_transmit_skb() which pushes the
TCP headers and fills in all of the TCP header options as well.


[PATCH 2/2] net: bridge: Convert timers to use timer_setup()

2017-11-01 Thread Allen Pais
Switch to using the new timer_setup() and from_timer() APIs.

Signed-off-by: Allen Pais 
---
 net/bridge/br_stp_timer.c | 48 +++
 1 file changed, 19 insertions(+), 29 deletions(-)

diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 60b6fe2..0e68790 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -31,9 +31,9 @@ static int br_is_designated_for_some_port(const struct 
net_bridge *br)
return 0;
 }
 
-static void br_hello_timer_expired(unsigned long arg)
+static void br_hello_timer_expired(struct timer_list *t)
 {
-   struct net_bridge *br = (struct net_bridge *)arg;
+   struct net_bridge *br = from_timer(br, t, hello_timer);
 
br_debug(br, "hello timer expired\n");
spin_lock(>lock);
@@ -47,9 +47,9 @@ static void br_hello_timer_expired(unsigned long arg)
spin_unlock(>lock);
 }
 
-static void br_message_age_timer_expired(unsigned long arg)
+static void br_message_age_timer_expired(struct timer_list *t)
 {
-   struct net_bridge_port *p = (struct net_bridge_port *) arg;
+   struct net_bridge_port *p = from_timer(p, t, message_age_timer);
struct net_bridge *br = p->br;
const bridge_id *id = >designated_bridge;
int was_root;
@@ -80,9 +80,9 @@ static void br_message_age_timer_expired(unsigned long arg)
spin_unlock(>lock);
 }
 
-static void br_forward_delay_timer_expired(unsigned long arg)
+static void br_forward_delay_timer_expired(struct timer_list *t)
 {
-   struct net_bridge_port *p = (struct net_bridge_port *) arg;
+   struct net_bridge_port *p = from_timer(p, t, forward_delay_timer);
struct net_bridge *br = p->br;
 
br_debug(br, "port %u(%s) forward delay timer\n",
@@ -104,9 +104,9 @@ static void br_forward_delay_timer_expired(unsigned long 
arg)
spin_unlock(>lock);
 }
 
-static void br_tcn_timer_expired(unsigned long arg)
+static void br_tcn_timer_expired(struct timer_list *t)
 {
-   struct net_bridge *br = (struct net_bridge *) arg;
+   struct net_bridge *br = from_timer(br, t, tcn_timer);
 
br_debug(br, "tcn timer expired\n");
spin_lock(>lock);
@@ -118,9 +118,9 @@ static void br_tcn_timer_expired(unsigned long arg)
spin_unlock(>lock);
 }
 
-static void br_topology_change_timer_expired(unsigned long arg)
+static void br_topology_change_timer_expired(struct timer_list *t)
 {
-   struct net_bridge *br = (struct net_bridge *) arg;
+   struct net_bridge *br = from_timer(br, t, topology_change_timer);
 
br_debug(br, "topo change timer expired\n");
spin_lock(>lock);
@@ -129,9 +129,9 @@ static void br_topology_change_timer_expired(unsigned long 
arg)
spin_unlock(>lock);
 }
 
-static void br_hold_timer_expired(unsigned long arg)
+static void br_hold_timer_expired(struct timer_list *t)
 {
-   struct net_bridge_port *p = (struct net_bridge_port *) arg;
+   struct net_bridge_port *p = from_timer(p, t, hold_timer);
 
br_debug(p->br, "port %u(%s) hold timer expired\n",
 (unsigned int) p->port_no, p->dev->name);
@@ -144,27 +144,17 @@ static void br_hold_timer_expired(unsigned long arg)
 
 void br_stp_timer_init(struct net_bridge *br)
 {
-   setup_timer(>hello_timer, br_hello_timer_expired,
- (unsigned long) br);
-
-   setup_timer(>tcn_timer, br_tcn_timer_expired,
- (unsigned long) br);
-
-   setup_timer(>topology_change_timer,
- br_topology_change_timer_expired,
- (unsigned long) br);
+   timer_setup(>hello_timer, br_hello_timer_expired, 0);
+   timer_setup(>tcn_timer, br_tcn_timer_expired, 0);
+   timer_setup(>topology_change_timer,
+   br_topology_change_timer_expired, 0);
 }
 
 void br_stp_port_timer_init(struct net_bridge_port *p)
 {
-   setup_timer(>message_age_timer, br_message_age_timer_expired,
- (unsigned long) p);
-
-   setup_timer(>forward_delay_timer, br_forward_delay_timer_expired,
- (unsigned long) p);
-
-   setup_timer(>hold_timer, br_hold_timer_expired,
- (unsigned long) p);
+   timer_setup(>message_age_timer, br_message_age_timer_expired, 0);
+   timer_setup(>forward_delay_timer, br_forward_delay_timer_expired, 0);
+   timer_setup(>hold_timer, br_hold_timer_expired, 0);
 }
 
 /* Report ticks left (in USER_HZ) used for API */
-- 
1.9.1



[PATCH 1/2] net: bridge: Convert timers to use timer_setup()

2017-11-01 Thread Allen Pais
Switch to using the new timer_setup() and from_timer() APIs.

Signed-off-by: Allen Pais 
---
 net/bridge/br_multicast.c | 75 +++
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8dc5c8d..8de4d10 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -239,9 +239,9 @@ static void br_multicast_free_group(struct rcu_head *head)
kfree(mp);
 }
 
-static void br_multicast_group_expired(unsigned long data)
+static void br_multicast_group_expired(struct timer_list *t)
 {
-   struct net_bridge_mdb_entry *mp = (void *)data;
+   struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer);
struct net_bridge *br = mp->br;
struct net_bridge_mdb_htable *mdb;
 
@@ -302,9 +302,9 @@ static void br_multicast_del_pg(struct net_bridge *br,
WARN_ON(1);
 }
 
-static void br_multicast_port_group_expired(unsigned long data)
+static void br_multicast_port_group_expired(struct timer_list *t)
 {
-   struct net_bridge_port_group *pg = (void *)data;
+   struct net_bridge_port_group *pg = from_timer(pg, t, timer);
struct net_bridge *br = pg->port->br;
 
spin_lock(>multicast_lock);
@@ -701,8 +701,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct 
net_bridge *br,
 
mp->br = br;
mp->addr = *group;
-   setup_timer(>timer, br_multicast_group_expired,
-   (unsigned long)mp);
+   timer_setup(>timer, br_multicast_group_expired, 0);
 
hlist_add_head_rcu(>hlist[mdb->ver], >mhash[hash]);
mdb->size++;
@@ -729,8 +728,7 @@ struct net_bridge_port_group *br_multicast_new_port_group(
p->flags = flags;
rcu_assign_pointer(p->next, next);
hlist_add_head(>mglist, >mglist);
-   setup_timer(>timer, br_multicast_port_group_expired,
-   (unsigned long)p);
+   timer_setup(>timer, br_multicast_port_group_expired, 0);
 
if (src)
memcpy(p->eth_addr, src, ETH_ALEN);
@@ -843,9 +841,10 @@ static int br_ip6_multicast_add_group(struct net_bridge 
*br,
 }
 #endif
 
-static void br_multicast_router_expired(unsigned long data)
+static void br_multicast_router_expired(struct timer_list *t)
 {
-   struct net_bridge_port *port = (void *)data;
+   struct net_bridge_port *port =
+   from_timer(port, t, multicast_router_timer);
struct net_bridge *br = port->br;
 
spin_lock(>multicast_lock);
@@ -859,7 +858,7 @@ static void br_multicast_router_expired(unsigned long data)
spin_unlock(>multicast_lock);
 }
 
-static void br_multicast_local_router_expired(unsigned long data)
+static void br_multicast_local_router_expired(struct timer_list *unused)
 {
 }
 
@@ -876,17 +875,17 @@ static void br_multicast_querier_expired(struct 
net_bridge *br,
spin_unlock(>multicast_lock);
 }
 
-static void br_ip4_multicast_querier_expired(unsigned long data)
+static void br_ip4_multicast_querier_expired(struct timer_list *t)
 {
-   struct net_bridge *br = (void *)data;
+   struct net_bridge *br = from_timer(br, t, ip4_other_query.timer);
 
br_multicast_querier_expired(br, >ip4_own_query);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static void br_ip6_multicast_querier_expired(unsigned long data)
+static void br_ip6_multicast_querier_expired(struct timer_list *t)
 {
-   struct net_bridge *br = (void *)data;
+   struct net_bridge *br = from_timer(br, t, ip6_other_query.timer);
 
br_multicast_querier_expired(br, >ip6_own_query);
 }
@@ -987,17 +986,17 @@ static void br_multicast_send_query(struct net_bridge *br,
spin_unlock(>multicast_lock);
 }
 
-static void br_ip4_multicast_port_query_expired(unsigned long data)
+static void br_ip4_multicast_port_query_expired(struct timer_list *t)
 {
-   struct net_bridge_port *port = (void *)data;
+   struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer);
 
br_multicast_port_query_expired(port, >ip4_own_query);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static void br_ip6_multicast_port_query_expired(unsigned long data)
+static void br_ip6_multicast_port_query_expired(struct timer_list *t)
 {
-   struct net_bridge_port *port = (void *)data;
+   struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer);
 
br_multicast_port_query_expired(port, >ip6_own_query);
 }
@@ -1019,13 +1018,13 @@ int br_multicast_add_port(struct net_bridge_port *port)
 {
port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
-   setup_timer(>multicast_router_timer, br_multicast_router_expired,
-   (unsigned long)port);
-   setup_timer(>ip4_own_query.timer,
-   br_ip4_multicast_port_query_expired, (unsigned long)port);
+   timer_setup(>multicast_router_timer,
+   br_multicast_router_expired, 0);
+   

Re: [PATCH net-next v2 0/3] enic: Additional ethtool support

2017-11-01 Thread David Miller
From: Parvi Kaustubhi 
Date: Wed,  1 Nov 2017 08:44:45 -0700

> This patch set allows the user to show or modify rx/tx ring sizes using
> ethtool.
> 
> v2:
> - remove unused variable to fix build warning.
> - update list of maintainers for cisco vic ethernet nic driver.

Series applied, thanks.


Re: [net-next PATCH 0/4] Updates for samples/pktgen

2017-11-01 Thread David Miller
From: Jesper Dangaard Brouer 
Date: Wed, 01 Nov 2017 11:41:04 +0100

> This patchset updates samples/pktgen and synchronize with changes
> maintained in https://github.com/netoptimizer/network-testing/
> 
> Features wise Robert Hoo  added support for
> detecting and determining dev NUMA node IRQs, and added a new script
> named pktgen_sample06_numa_awared_queue_irq_affinity.sh that use these
> features.
> 
> Cleanup remove last of the old sample files, as IPv6 is covered by
> existing sample code.

Series applied, thanks Jesper.


Re: Bond recovery from BOND_LINK_FAIL state not working

2017-11-01 Thread Jay Vosburgh
Jarod Wilson  wrote:

>On 2017-11-01 8:35 PM, Jay Vosburgh wrote:
>> Jay Vosburgh  wrote:
>>
>>> Alex Sidorenko  wrote:
>>>
 The problem has been found while trying to deploy RHEL7 on HPE Synergy
 platform, it is seen both in customer's environment and in HPE test lab.

 There are several bonds configured in TLB mode and miimon=100, all other
 options are default. Slaves are connected to VirtualConnect
 modules. Rebooting a VC module should bring one bond slave (ens3f0) down
 temporarily, but not another one (ens3f1). But what we see is

 Oct 24 10:37:12 SYDC1LNX kernel: bond0: link status up again after 0 ms 
 for interface ens3f1
>>
>>  In net-next, I don't see a path in the code that will lead to
>> this message, as it would apparently require entering
>> bond_miimon_inspect in state BOND_LINK_FAIL but with downdelay set to 0.
>> If downdelay is 0, the code will transition to BOND_LINK_DOWN and not
>> remain in _FAIL state.
>
>The kernel in question is laden with a fair bit of additional debug spew,
>as we were going back and forth, trying to isolate where things were going
>wrong.  That was indeed from the BOND_LINK_FAIL state in
>bond_miimon_inspect, inside the if (link_state) clause though, so after
>commit++, there's a continue, which ... does what now? Doesn't it take us
>back to the top of the bond_for_each_slave_rcu() loop, so we bypass the
>next few lines of code that would have led to a transition to
>BOND_LINK_DOWN?

Just to confirm: your downdelay is 0, correct?

And, do you get any other log messages other than "link status
up again after 0 ms"?

To answer your question, yes, the "if (link_state) {" block in
the BOND_LINK_FAIL case of bond_miimon_inspect ends in continue, but
this path is nominally for the downdelay logic.  If downdelay is active
and the link recovers before the delay expires, the link should never
have moved to BOND_LINK_DOWN.  The commit++ causes bond_miimon_inspect
to return nonzero, causing in turn the bond_propose_link_state change to
BOND_LINK_FAIL state to be committed.  This path deliberately does not
set slave->new_link, as downdelay is purposely delaying the transition
to BOND_LINK_DOWN.

If downdelay is 0, the slave->link should not persist in
BOND_LINK_FAIL state; it should set new_link = BOND_LINK_DOWN which will
cause a transition in bond_miimon_commit.  The bond_propose_link_state
call to set BOND_LINK_FAIL in the BOND_LINK_UP case will be committed in
bond_mii_monitor prior to calling bond_miimon_commit, which will in turn
do the transition to _DOWN state.  In this case, the BOND_LINK_FAIL case
"if (link_state) {" block should never be entered.

I'm a little leery of adding the state transition you suggest
without understanding how this situation arose, as it shouldn't get into
this condition in the first place.

-J

>...
>>> Your patch does not apply to net-next, so I'm not absolutely
>>> sure where this is, but presuming that this is in the BOND_LINK_FAIL
>>> case of the switch, it looks like both BOND_LINK_FAIL and BOND_LINK_BACK
>>> will have the issue that if the link recovers or fails, respectively,
>>> within the delay window (for down/updelay > 0) it won't set a
>>> slave->new_link.
>>>
>>> Looks like this got lost somewhere along the line, as originally
>>> the transition back to UP (or DOWN) happened immediately, and that has
>>> been lost somewhere.
>>>
>>> I'll have to dig out when that broke, but I'll see about a test
>>> patch this afternoon.
>>
>>  The case I was concerned with was moved around; the proposed
>> state is committed in bond_mii_monitor.  But to commit to _FAIL state,
>> the downdelay would have to be > 0.  I'm not seeing any errors in
>> net-next; can you reproduce your erroneous behavior on net-next?
>
>I can try to get a net-next-ish kernel into their hands, but the bonding
>driver we're working with here is quite close to current net-next already,
>so I'm fairly confident the same thing will happen.
>
>-- 
>Jarod Wilson
>ja...@redhat.com

---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: [PATCH net v4 1/2] imx7s/imx7d has the ptp interrupt newly added as well.

2017-11-01 Thread Shawn Guo
On Wed, Nov 1, 2017 at 4:16 AM, Troy Kisky
 wrote:
> For imx7, "int0" is the interrupt for queue 0 and ENET_MII
> "int1" is for queue 1
> "int2" is for queue 2
>
> For imx6sx, "int0" handles all 3 queues and ENET_MII
>
> And of course, the "pps" interrupt is for the PTP_CLOCK_PPS interrupts
> This will help document what each interrupt does.
>
> Signed-off-by: Troy Kisky 

I updated my email address in MAINTAINERS years ago.  Please use that one.

> v4: add blank, ie s/"int0","pps"/"int0", "pps"/ as suggested by Andrew Lunn
> ---
>  Documentation/devicetree/bindings/net/fsl-fec.txt | 13 +

Please have bindings go with net driver instead of DTS changes.

Shawn

>  arch/arm/boot/dts/imx6qdl.dtsi|  1 +
>  arch/arm/boot/dts/imx6sx.dtsi |  2 ++
>  arch/arm/boot/dts/imx6ul.dtsi |  2 ++
>  arch/arm/boot/dts/imx7d.dtsi  |  6 --
>  arch/arm/boot/dts/imx7s.dtsi  |  6 --
>  6 files changed, 26 insertions(+), 4 deletions(-)


RE: [PATCH net v4 2/2] net: fec: Let fec_ptp have its own interrupt routine

2017-11-01 Thread Andy Duan
From: Troy Kisky   Sent: Thursday, November 02, 
2017 1:36 AM
>To: Andy Duan ; shawn@linaro.org;
>netdev@vger.kernel.org; da...@davemloft.net
>Cc: Fabio Estevam ; lzn...@gmail.com;
>and...@lunn.ch
>Subject: Re: [PATCH net v4 2/2] net: fec: Let fec_ptp have its own interrupt
>routine
>
>On 10/31/2017 7:43 PM, Andy Duan wrote:
>> From: Troy Kisky  Sent: Wednesday,
>> November 01, 2017 4:17 AM
>>> This is better for code locality and should slightly speed up normal
>interrupts.
>>>
>>> This also allows PPS clock output to start working for i.mx7. This is
>>> because
>>> i.mx7 was already using the limit of 3 interrupts, and needed another.
>>>
>>> Signed-off-by: Troy Kisky 
>>>
>>> ---
>>>
>>> v2: made this change independent of any devicetree change so that old
>>> dtbs continue to work.
>>>
>>> Continue to register ptp clock if interrupt is not found.
>>>
>>> v3: renamed "ptp" interrupt to "pps" interrupt
>>>
>>> v4: no change
>>> ---
>>> drivers/net/ethernet/freescale/fec.h  |  3 +-
>>> drivers/net/ethernet/freescale/fec_main.c | 25 ++
>>> drivers/net/ethernet/freescale/fec_ptp.c  | 82
>>> ++
>>> -
>>> 3 files changed, 65 insertions(+), 45 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/freescale/fec.h
>>> b/drivers/net/ethernet/freescale/fec.h
>>> index ede1876a9a19..be56ac1f1ac4 100644
>>> --- a/drivers/net/ethernet/freescale/fec.h
>>> +++ b/drivers/net/ethernet/freescale/fec.h
>>> @@ -582,12 +582,11 @@ struct fec_enet_private {
>>> u64 ethtool_stats[0];
>>> };
>>>
>>> -void fec_ptp_init(struct platform_device *pdev);
>>> +void fec_ptp_init(struct platform_device *pdev, int irq_index);
>>
>> Seems change irq_index to irq_idx much better.
>
>
>no problem
>
>>> @@ -3465,18 +3463,27 @@ fec_probe(struct platform_device *pdev)
>>> if (ret)
>>> goto failed_reset;
>>>
>>> +   irq_cnt = platform_irq_count(pdev);
>>> +   if (irq_cnt > FEC_IRQ_NUM)
>>> +   irq_cnt = FEC_IRQ_NUM;  /* last for pps */
>>> +   else if (irq_cnt == 2)
>>> +   irq_cnt = 1;/* last for pps */
>>> +   else if (irq_cnt <= 0)
>>> +   irq_cnt = 1;/* Let the for loop fail */
>>> +
>>
>> Do some parse on probe function seems uncomfortable...can you
>encapsulate these code into one api ?
>>
>
>
>Do you mean something like
>   irq_cnt = limit_irq(pdev);
>
>
>int limit_irq(struct platform_device *pdev) {

It is better: fec_enet_get_irq_cnt()

>   int irq_cnt = platform_irq_count(pdev);
>
>   if (irq_cnt > FEC_IRQ_NUM)
>   irq_cnt = FEC_IRQ_NUM;  /* last for pps */
>   else if (irq_cnt == 2)
>   irq_cnt = 1;/* last for pps */
>   else if (irq_cnt <= 0)
>   irq_cnt = 1;/* At least 1 irq is needed */
>   return irq_cnt;
>}
>___
>Can you give some code to your idea? I don't think I understand.

Others seems fine for me.


Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic

2017-11-01 Thread Jason Wang



On 2017年11月02日 11:45, Michael S. Tsirkin wrote:

On Thu, Nov 02, 2017 at 11:43:48AM +0800, Jason Wang wrote:


On 2017年11月02日 09:11, Willem de Bruijn wrote:

On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang  wrote:

tun now uses a flow-cache-based automatic queue steering method. This
may not suffice all user cases. To extend it to be able to use more
flow steering policy, this patch abstracts flow steering logic into
tun_steering_ops, then we can declare and use different methods in
the future.
Signed-off-by: Jason Wang 
---
   drivers/net/tun.c | 85 
+--
   1 file changed, 63 insertions(+), 22 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ea29da9..bff6259 100644

The previous RFC enabled support for multiple pluggable steering
policies. But as all can be implemented in BPF and we only plan to
support an eBPF policy besides the legacy one, this patch is no longer
needed. We can save a few indirect function calls.

But we should at least support two kinds of steering policy, so this is
still needed?

And I'm not quite sure we can implement all kinds of policies through BPF
e.g RSS or we may want to offload the queue selection to underlayer switch
or nic .

Thanks

I think a simple if condition is preferable for now, too. Let's wait
until we get some 3/4 of these.



That's a solution, but we may need an if in at least four places. If this is 
ok, I will do it in next version.


Thanks


Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method

2017-11-01 Thread Jason Wang



On 2017年11月01日 21:59, Michael S. Tsirkin wrote:

On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:


On 2017年11月01日 00:45, Michael S. Tsirkin wrote:

+static void __tun_set_steering_ebpf(struct tun_struct *tun,
+   struct bpf_prog *new)
+{
+   struct bpf_prog *old;
+
+   old = rtnl_dereference(tun->steering_prog);
+   rcu_assign_pointer(tun->steering_prog, new);
+
+   if (old) {
+   synchronize_net();
+   bpf_prog_destroy(old);
+   }
+}
+

Is this really called under rtnl?

Yes it is __tun_chr_ioctl() will call rtnl_lock().

Is the call from tun_free_netdev under rtnl too?


Looks not, need hold rtnl_lock() in tun_free_netdev in next version.




If no then rtnl_dereference
is wrong. If yes I'm not sure you can call synchronize_net
under rtnl.


Are you worrying about the long wait? Looking at synchronize_net(), it does:

void synchronize_net(void)
{
     might_sleep();
     if (rtnl_is_locked())
         synchronize_rcu_expedited();
     else
         synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

Thanks

Not the wait - expedited is not a good thing to allow unpriveledged
userspace to do, it interrupts all VMs running on the same box.


Good point.



We could use a callback though the docs warn userspace can use that
to cause a DOS and needs to be limited.




Will do this in next version.

Thanks


Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic

2017-11-01 Thread Michael S. Tsirkin
On Thu, Nov 02, 2017 at 11:43:48AM +0800, Jason Wang wrote:
> 
> 
> On 2017年11月02日 09:11, Willem de Bruijn wrote:
> > On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang  wrote:
> > > tun now uses a flow-cache-based automatic queue steering method. This
> > > may not suffice all user cases. To extend it to be able to use more
> > > flow steering policy, this patch abstracts flow steering logic into
> > > tun_steering_ops, then we can declare and use different methods in
> > > the future.
> > > Signed-off-by: Jason Wang 
> > > ---
> > >   drivers/net/tun.c | 85 
> > > +--
> > >   1 file changed, 63 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > > index ea29da9..bff6259 100644
> > The previous RFC enabled support for multiple pluggable steering
> > policies. But as all can be implemented in BPF and we only plan to
> > support an eBPF policy besides the legacy one, this patch is no longer
> > needed. We can save a few indirect function calls.
> 
> But we should at least support two kinds of steering policy, so this is
> still needed?
> 
> And I'm not quite sure we can implement all kinds of policies through BPF
> e.g RSS or we may want to offload the queue selection to underlayer switch
> or nic .
> 
> Thanks

I think a simple if condition is preferable for now, too. Let's wait
until we get some 3/4 of these.

-- 
MST


Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic

2017-11-01 Thread Jason Wang



On 2017年11月02日 09:11, Willem de Bruijn wrote:

On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang  wrote:

tun now uses a flow-cache-based automatic queue steering method. This
may not suffice all user cases. To extend it to be able to use more
flow steering policy, this patch abstracts flow steering logic into
tun_steering_ops, then we can declare and use different methods in
the future.
Signed-off-by: Jason Wang 
---
  drivers/net/tun.c | 85 +--
  1 file changed, 63 insertions(+), 22 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ea29da9..bff6259 100644

The previous RFC enabled support for multiple pluggable steering
policies. But as all can be implemented in BPF and we only plan to
support an eBPF policy besides the legacy one, this patch is no longer
needed. We can save a few indirect function calls.


But we should at least support two kinds of steering policy, so this is 
still needed?


And I'm not quite sure we can implement all kinds of policies through 
BPF e.g RSS or we may want to offload the queue selection to underlayer 
switch or nic .


Thanks


Re: [PATCH net-next 3/9] net: hns3: Refactor the initialization of command queue

2017-11-01 Thread Yunsheng Lin
Hi, Lipeng

On 2017/11/1 22:47, Lipeng wrote:
> From: qumingguang 
> 
> There is no necessary to reallocate the descriptor and remap the descriptor
> memory in reset process, But there is still some other action exit in both

exit -> exist

> reset process and initialization process.
> 
> To reuse the common interface of reset process and initialization process,

of -> in

> This patch moved out the descriptor allocate and memory maping from

This -> this, moved -> moves

> interface cmdq_init.
> 
> Signed-off-by: qumingguang 
> Signed-off-by: Lipeng 
> ---
>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 39 
> +-
>  .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  1 +
>  .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c|  9 -
>  3 files changed, 33 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c 
> b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
> index 60960e5..ff13d18 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
> @@ -62,7 +62,7 @@ static void hclge_free_cmd_desc(struct hclge_cmq_ring *ring)
>   ring->desc = NULL;
>  }
>  
> -static int hclge_init_cmd_queue(struct hclge_dev *hdev, int ring_type)
> +static int hclge_alloc_cmd_queue(struct hclge_dev *hdev, int ring_type)
>  {
>   struct hclge_hw *hw = >hw;
>   struct hclge_cmq_ring *ring =
> @@ -79,9 +79,6 @@ static int hclge_init_cmd_queue(struct hclge_dev *hdev, int 
> ring_type)
>   return ret;
>   }
>  
> - ring->next_to_clean = 0;
> - ring->next_to_use = 0;
> -
>   return 0;
>  }
>  
> @@ -302,37 +299,52 @@ static enum hclge_cmd_status 
> hclge_cmd_query_firmware_version(
>   return ret;
>  }
>  
> -int hclge_cmd_init(struct hclge_dev *hdev)
> +int hclge_cmd_queue_init(struct hclge_dev *hdev)
>  {
> - u32 version;
>   int ret;
>  
>   /* Setup the queue entries for use cmd queue */
>   hdev->hw.cmq.csq.desc_num = HCLGE_NIC_CMQ_DESC_NUM;
>   hdev->hw.cmq.crq.desc_num = HCLGE_NIC_CMQ_DESC_NUM;
>  
> - /* Setup the lock for command queue */
> - spin_lock_init(>hw.cmq.csq.lock);
> - spin_lock_init(>hw.cmq.crq.lock);
> -
>   /* Setup Tx write back timeout */
>   hdev->hw.cmq.tx_timeout = HCLGE_CMDQ_TX_TIMEOUT;
>  
>   /* Setup queue rings */
> - ret = hclge_init_cmd_queue(hdev, HCLGE_TYPE_CSQ);
> + ret = hclge_alloc_cmd_queue(hdev, HCLGE_TYPE_CSQ);
>   if (ret) {
>   dev_err(>pdev->dev,
>   "CSQ ring setup error %d\n", ret);
>   return ret;
>   }
>  
> - ret = hclge_init_cmd_queue(hdev, HCLGE_TYPE_CRQ);
> + ret = hclge_alloc_cmd_queue(hdev, HCLGE_TYPE_CRQ);
>   if (ret) {
>   dev_err(>pdev->dev,
>   "CRQ ring setup error %d\n", ret);
>   goto err_csq;
>   }
>  
> + return 0;
> +err_csq:
> + hclge_free_cmd_desc(>hw.cmq.csq);
> + return ret;
> +}
> +
> +int hclge_cmd_init(struct hclge_dev *hdev)
> +{
> + u32 version;
> + int ret;
> +
> + hdev->hw.cmq.csq.next_to_clean = 0;
> + hdev->hw.cmq.csq.next_to_use = 0;
> + hdev->hw.cmq.crq.next_to_clean = 0;
> + hdev->hw.cmq.crq.next_to_use = 0;
> +
> + /* Setup the lock for command queue */
> + spin_lock_init(>hw.cmq.csq.lock);
> + spin_lock_init(>hw.cmq.crq.lock);
> +
>   hclge_cmd_init_regs(>hw);
>  
>   ret = hclge_cmd_query_firmware_version(>hw, );
> @@ -346,9 +358,6 @@ int hclge_cmd_init(struct hclge_dev *hdev)
>   dev_info(>pdev->dev, "The firmware version is %08x\n", version);
>  
>   return 0;
> -err_csq:
> - hclge_free_cmd_desc(>hw.cmq.csq);
> - return ret;
>  }
>  
>  static void hclge_destroy_queue(struct hclge_cmq_ring *ring)
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h 
> b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
> index b437334..6bdc216 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
> @@ -750,4 +750,5 @@ enum hclge_cmd_status hclge_cmd_mdio_read(struct hclge_hw 
> *hw,
> struct hclge_desc *desc);
>  
>  void hclge_destroy_cmd_queue(struct hclge_hw *hw);
> +int hclge_cmd_queue_init(struct hclge_dev *hdev);
>  #endif
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
> b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> index 4ef4592..a7686fe 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> @@ -4446,7 +4446,14 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev 
> *ae_dev)
>   goto err_pci_init;
>   }
>  
> - /* Command queue initialize */
> + /* Firmware command queue 

Re: [PATCH 4/7] MIPS: Octeon: Add Free Pointer Unit (FPA) support.

2017-11-01 Thread Florian Fainelli
Le 11/01/17 à 17:36, David Daney a écrit :
> From: Carlos Munoz 
> 
> From the hardware user manual: "The FPA is a unit that maintains
> pools of pointers to free L2/DRAM memory. To provide QoS, the pools
> are referenced indirectly through 1024 auras. Both core software
> and hardware units allocate and free pointers."

This looks like a possibly similar implementation to
drivers/net/ethernet/marvell/mvneta_bm.c; can you see if you can make
any use of genpool_* and include/net/hwbm.h here as well?
-- 
Florian


Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method

2017-11-01 Thread Jason Wang



On 2017年11月02日 03:12, Alexei Starovoitov wrote:

On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:

On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:


On 2017年11月01日 00:45, Michael S. Tsirkin wrote:

+static void __tun_set_steering_ebpf(struct tun_struct *tun,
+   struct bpf_prog *new)
+{
+   struct bpf_prog *old;
+
+   old = rtnl_dereference(tun->steering_prog);
+   rcu_assign_pointer(tun->steering_prog, new);
+
+   if (old) {
+   synchronize_net();
+   bpf_prog_destroy(old);
+   }
+}
+

Is this really called under rtnl?

Yes it is __tun_chr_ioctl() will call rtnl_lock().

Is the call from tun_free_netdev under rtnl too?


If no then rtnl_dereference
is wrong. If yes I'm not sure you can call synchronize_net
under rtnl.


Are you worrying about the long wait? Looking at synchronize_net(), it does:

void synchronize_net(void)
{
     might_sleep();
     if (rtnl_is_locked())
         synchronize_rcu_expedited();
     else
         synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

Thanks

Not the wait - expedited is not a good thing to allow unpriveledged
userspace to do, it interrupts all VMs running on the same box.

We could use a callback though the docs warn userspace can use that
to cause a DOS and needs to be limited.

the whole __tun_set_steering_ebpf() looks odd to me.
There is tun_attach_filter/tun_detach_filter pattern
that works for classic BPF. Why for eBPF this strange
synchronize_net() is there?



I'm not sure I get the question. eBPF here is used to do queue 
selection, so we could not reuse socket filter (tun_detach_filter use 
call_rcu()). cBPF could be used here, but I'm not quite sure it's worth 
supporting. And I agree we should use call_rcu() here.


Hope this answer your question.

Thanks



Re: [PATCH net-next] vhost_net: conditionally enable tx polling

2017-11-01 Thread Jason Wang



On 2017年11月01日 23:03, Michael S. Tsirkin wrote:

On Wed, Nov 01, 2017 at 08:51:36PM +0800, Jason Wang wrote:


On 2017年11月01日 00:36, Michael S. Tsirkin wrote:

On Tue, Oct 31, 2017 at 06:27:20PM +0800, Jason Wang wrote:

We always poll tx for socket, this is sub optimal since:

- we only want to be notified when sndbuf is available
- this will slightly increase the waitqueue traversing time and more
important, vhost could not benefit from commit
commit 9e641bdcfa4e
("net-tun: restructure tun_do_read for better sleep/wakeup efficiency")
even if we've stopped rx polling during handle_rx() since tx poll
were still left in the waitqueue.

Pktgen from a remote host to VM over mlx4 shows 5.5% improvements on
rx PPS. (from 1.27Mpps to 1.34Mpps)

Cc: Wei Xu 
Cc: Matthew Rosato 
Signed-off-by: Jason Wang 
---

Now that vhost_poll_stop happens on data path
a lot, I'd say
  if (poll->wqh)
there should be unlikely().

It has been there since 8241a1e466cd ("vhost_net: stop polling socket during
rx processing"). So it will be used for rx path too which unlikely() does
not work as well as the case in tx.

Worth testing, does not have to block this patch.


Ok.




   drivers/vhost/net.c | 11 ---
   1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 68677d9..286c3e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -471,6 +471,7 @@ static void handle_tx(struct vhost_net *net)
goto out;
vhost_disable_notify(>dev, vq);
+   vhost_net_disable_vq(net, vq);
hdr_size = nvq->vhost_hlen;
zcopy = nvq->ubufs;
@@ -556,6 +557,8 @@ static void handle_tx(struct vhost_net *net)
% UIO_MAXIOV;
}
vhost_discard_vq_desc(vq, 1);
+   if (err == -EAGAIN)
+   vhost_net_enable_vq(net, vq);
break;
}
if (err != len)

I would probably just enable it unconditionally here. Why not?


I thought we only care about the case of tun_sock_write_space() and for the
errors other than -EAGAIN, they have nothing to do with polling.

We could thinkably get ENOMEM I guess. If we miss a code things
get stuck - It's just easier not to add extra code IMHO.


ENOBUFS actually. I agree to remove the specific checking of -EAGAIN here.




@@ -1145,9 +1148,11 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
r = vhost_vq_init_access(vq);
if (r)
goto err_used;
-   r = vhost_net_enable_vq(n, vq);
-   if (r)
-   goto err_used;
+   if (index == VHOST_NET_VQ_RX) {
+   r = vhost_net_enable_vq(n, vq);
+   if (r)
+   goto err_used;
+   }
oldubufs = nvq->ubufs;
nvq->ubufs = ubufs;

This last chunk seems questionable. If queue has stuff in it
when we connect the backend, we'll miss a wakeup.
I suspect this can happen during migration.

Unless qemu passes a tap which has already had pending tx packets.

I can remove this chunk, but if guest does not transmit any packet, rx can't
benefit from this.

Thanks

Not sure I understand the last sentence. vhost will stay
polling the socket - why is that a problem?


No functional problem at all. If guest does not transmit any packet, the 
patch does not have any effect.


Will remove this chunk in next version.

Thanks






--
2.7.4

___
Virtualization mailing list
virtualizat...@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization




Re: [PATCH net-next v15] openvswitch: enable NSH support

2017-11-01 Thread Yang, Yi
On Thu, Nov 02, 2017 at 08:52:40AM +0800, Pravin Shelar wrote:
> On Tue, Oct 31, 2017 at 9:03 PM, Yi Yang  wrote:
> >
> > OVS master and 2.8 branch has merged NSH userspace
> > patch series, this patch is to enable NSH support
> > in kernel data path in order that OVS can support
> > NSH in compat mode by porting this.
> >
> > Signed-off-by: Yi Yang 
> > ---
> I have comment related to checksum, otherwise patch looks good to me.

Pravin, thank you for your comments, the below part is incremental patch
for checksum, please help check it, I'll send out v16 with this after
you confirm.

diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 2764682..d7da99a 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -36,6 +36,7 @@ int nsh_push(struct sk_buff *skb, const struct nshhdr 
*pushed_nh)
nh = (struct nshhdr *)(skb->data);
memcpy(nh, pushed_nh, length);
nh->np = next_proto;
+   skb_postpush_rcsum(skb, nh, length);
 
skb->protocol = htons(ETH_P_NSH);
skb_reset_mac_header(skb);
@@ -63,7 +64,7 @@ int nsh_pop(struct sk_buff *skb)
if (!inner_proto)
return -EAFNOSUPPORT;
 
-   skb_pull(skb, length);
+   skb_pull_rcsum(skb, length);
skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb_reset_mac_len(skb);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index dd1449d..5ba0e35 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -671,6 +671,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
return err;
 
nh = nsh_hdr(skb);
+   skb_postpull_rcsum(skb, nh, length);
flags = nsh_get_flags(nh);
flags = OVS_MASKED(flags, key.base.flags, mask.base.flags);
flow_key->nsh.base.flags = flags;
@@ -698,6 +699,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
default:
return -EINVAL;
}
+   skb_postpush_rcsum(skb, nh, length);
return 0;
 }
 


Re: Confirm Receipt Of This Mail

2017-11-01 Thread Meiwa Corporation co.Ltd


Season Greetings To You;

Are you interested in taking up a representative position (Account Receivable 
Agent) for Meiwa Corporation co.Ltd, who is currently looking on contracting 
both Companies and Individuals who are based in Canada and USA irrespectively

Please Note:
* This is a part time contract offer that requires not too much of your time 
daily.
* This position is open to all regardless of your Profession, Occupation and 
Position.
* This position comes with Monthly stipends and good commission.

Do Advise for more details if interested in taking up this offer Kindly Reply 
Back ASAP.

Best Regard
HR Manager

- Opprinnelig melding -
Fra: Meiwa Corporation co.Ltd 
Til: marius arnesen 
Sendt: Thu, 02 Nov 2017 03:18:17 +0100 (CET)
Emne: Confirm Receipt Of This Mail




Re: Bond recovery from BOND_LINK_FAIL state not working

2017-11-01 Thread Jarod Wilson

On 2017-11-01 8:35 PM, Jay Vosburgh wrote:

Jay Vosburgh  wrote:


Alex Sidorenko  wrote:


The problem has been found while trying to deploy RHEL7 on HPE Synergy
platform, it is seen both in customer's environment and in HPE test lab.

There are several bonds configured in TLB mode and miimon=100, all other
options are default. Slaves are connected to VirtualConnect
modules. Rebooting a VC module should bring one bond slave (ens3f0) down
temporarily, but not another one (ens3f1). But what we see is

Oct 24 10:37:12 SYDC1LNX kernel: bond0: link status up again after 0 ms for 
interface ens3f1


In net-next, I don't see a path in the code that will lead to
this message, as it would apparently require entering
bond_miimon_inspect in state BOND_LINK_FAIL but with downdelay set to 0.
If downdelay is 0, the code will transition to BOND_LINK_DOWN and not
remain in _FAIL state.


The kernel in question is laden with a fair bit of additional debug 
spew, as we were going back and forth, trying to isolate where things 
were going wrong.  That was indeed from the BOND_LINK_FAIL state in 
bond_miimon_inspect, inside the if (link_state) clause though, so after 
commit++, there's a continue, which ... does what now? Doesn't it take 
us back to the top of the bond_for_each_slave_rcu() loop, so we bypass 
the next few lines of code that would have led to a transition to 
BOND_LINK_DOWN?


...

Your patch does not apply to net-next, so I'm not absolutely
sure where this is, but presuming that this is in the BOND_LINK_FAIL
case of the switch, it looks like both BOND_LINK_FAIL and BOND_LINK_BACK
will have the issue that if the link recovers or fails, respectively,
within the delay window (for down/updelay > 0) it won't set a
slave->new_link.

Looks like this got lost somewhere along the line, as originally
the transition back to UP (or DOWN) happened immediately, and that has
been lost somewhere.

I'll have to dig out when that broke, but I'll see about a test
patch this afternoon.


The case I was concerned with was moved around; the proposed
state is committed in bond_mii_monitor.  But to commit to _FAIL state,
the downdelay would have to be > 0.  I'm not seeing any errors in
net-next; can you reproduce your erroneous behavior on net-next?


I can try to get a net-next-ish kernel into their hands, but the bonding 
driver we're working with here is quite close to current net-next 
already, so I'm fairly confident the same thing will happen.


--
Jarod Wilson
ja...@redhat.com


Re: [PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 18:00 -0700, Eric Dumazet wrote:
> On Wed, 2017-11-01 at 17:10 -0700, Christoph Paasch wrote:
> 
> > Yes, that looks good to me. Thanks!
> > 
> > But we still need to clean up the skb in tcp_v4_send_reset and
> > tcp_v4_send_ack, as the fields (e.g., tcp_skb_cb->seq) have been set when
> > coming from tcp_v4_rcv.
> 
> You might be confused : ip_send_unicast_reply() does not send back the
> incoming skb.
> 
> A fresh skb is allocated, then appended/sent.
> 
> And commit 24a2d43d8886f5a29c did the changes to provide to
> __ip_options_echo() the proper IPCB header location.
> 

More details :

Fields written by tcp_init_nondata_skb() on the synack packet :

->seq  (32bits) at offset 0 of skb->cb[]
->end_seq  (32bits) at offset 4 of skb->cb[]
->tcp_gso_segs (16bits) at offset 8
->tcp_flags(8bits) at offset 12 value (TCPHDR_SYN | TCPHDR_ACK ->
0x12)
->sacked   (8bits) at offset 13

IPCB fields sharing these 14 bytes :

iif  /* 32bits, offset 0 */
opt.faddr(32bits) offset 4
opt.nexthop  (32bits) offset 8 /* value 1 */
opt.optlen   (8bits) offset 12 /* value 0x12 */
opt.srr  (8bits) offset 13

IP6CB fields sharing these 14 bytes :

iif   /* 32bits, offset 0 */
ra/* 16 bits, offset 4 */
dst0  /* 16 bits offset 6 */
srcrt /* 16 bits offset 8 */  -> 0x0001
dst1  /* 16 bits offset 10 */ (not mangled -> 0)
lastopt /* 16 bits offset 12 */  -> 0x12


At xmit :

IPV4 uses ip_build_and_send_pkt() to transmit the SYNACK, so skb->cb[]
is not used.

IPv6 uses other fields.

So I really wonder what exact issue you observed, please share your
drugs ;)

Thanks !





Re: [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API

2017-11-01 Thread Willem de Bruijn
On Tue, Oct 31, 2017 at 9:41 PM, Björn Töpel  wrote:
> From: Björn Töpel 
>
> This patch adds the necessary AF_PACKET V4 structures for usage from
> userspace. AF_PACKET V4 is a new interface optimized for high
> performance packet processing.
>
> Signed-off-by: Björn Töpel 
> ---
>  include/uapi/linux/if_packet.h | 65 
> +-
>  1 file changed, 64 insertions(+), 1 deletion(-)
>
> +struct tpacket4_queue {
> +   struct tpacket4_desc *ring;
> +
> +   unsigned int avail_idx;
> +   unsigned int last_used_idx;
> +   unsigned int num_free;
> +   unsigned int ring_mask;
> +};
>
>  struct packet_mreq {
> @@ -294,6 +335,28 @@ struct packet_mreq {
> unsigned char   mr_address[8];
>  };
>
> +/*
> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
> + * to register user memory which should be used to store the packet
> + * data.
> + *
> + * There are some constraints for the memory being registered:
> + * - The memory area has to be memory page size aligned.
> + * - The frame size has to be a power of 2.
> + * - The frame size cannot be smaller than 2048B.
> + * - The frame size cannot be larger than the memory page size.
> + *
> + * Corollary: The number of frames that can be stored is
> + * len / frame_size.
> + *
> + */
> +struct tpacket_memreg_req {
> +   unsigned long   addr;   /* Start of packet data area */
> +   unsigned long   len;/* Length of packet data area */
> +   unsigned intframe_size; /* Frame size */
> +   unsigned intdata_headroom;  /* Frame head room */
> +};

Existing packet sockets take a tpacket_req, allocate memory and let the
user process mmap this. I understand that TPACKET_V4 distinguishes
the descriptor from packet pools, but could both use the existing structs
and logic (packet_mmap)? That would avoid introducing a lot of new code
just for granting user pages to the kernel.

Also, use of unsigned long can cause problems on 32/64 bit compat
environments. Prefer fixed width types in uapi. Same for pointer in
tpacket4_queue.


( Compensation Reimbursement )

2017-11-01 Thread United Nations
View the enclosed file for your Compensation Reimbursement

Code Payment.pdf
Description: Adobe PDF document


Re: [PATCH 1/7] dt-bindings: Add Cavium Octeon Common Ethernet Interface.

2017-11-01 Thread David Daney

On 11/01/2017 06:09 PM, Florian Fainelli wrote:

On 11/01/2017 05:36 PM, David Daney wrote:

From: Carlos Munoz 

Add bindings for Common Ethernet Interface (BGX) block.

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---

[snip]

+Properties:
+
+- compatible: "cavium,octeon-7360-xcv": Compatibility with cn73xx SOCs.
+
+- reg: The index of the interface within the BGX block.
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+- cavium,rx-clk-delay-bypass: Set to <1> to bypass the rx clock delay setting.
+  Needed by the Micrel PHY.


Is not that implied by an appropriate "phy-mode" property already?


I think you are correct.  That string never appears in the source code, 
so I am going to remove that property from the binding document for the 
next revision of the patch set.


Thanks,
David Daney


Re: [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies

2017-11-01 Thread Willem de Bruijn
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang  wrote:
> This patch introduces new ioctl for change packet steering policy for
> tun. Only automatic flow steering is supported, more policies will
> come.
>
> Signed-off-by: Jason Wang 
> ---
>  drivers/net/tun.c   | 35 ++-
>  include/uapi/linux/if_tun.h |  7 +++
>  2 files changed, 41 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bff6259..ab109ff 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -122,7 +122,8 @@ do {  
>   \
>  #define TUN_VNET_BE 0x4000
>
>  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
> - IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
> + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS | \
> + IFF_MULTI_STEERING)
>
>  #define GOODCOPY_LEN 128
>
> @@ -2516,6 +2517,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
> int cmd,
> unsigned int ifindex;
> int le;
> int ret;
> +   unsigned int steering;
>
> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 
> SOCK_IOC_TYPE) {
> if (copy_from_user(, argp, ifreq_len))
> @@ -2774,6 +2776,37 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
> ret = 0;
> break;
>
> +   case TUNSETSTEERING:
> +   ret = -EFAULT;
> +   if (copy_from_user(, argp, sizeof(steering)))
> +   break;
> +   ret = 0;
> +   switch (steering) {
> +   case TUN_STEERING_AUTOMQ:
> +   tun->steering_ops = _automq_ops;
> +   break;
> +   default:
> +   ret = -EFAULT;
> +   }
> +   break;
> +
> +   case TUNGETSTEERING:
> +   ret = 0;
> +   if (tun->steering_ops == _automq_ops)
> +   steering = TUN_STEERING_AUTOMQ;
> +   else
> +   BUG();
> +   if (copy_to_user(argp, , sizeof(steering)))
> +   ret = -EFAULT;
> +   break;
> +
> +   case TUNGETSTEERINGFEATURES:
> +   ret = 0;
> +   steering = TUN_STEERING_AUTOMQ;
> +   if (copy_to_user(argp, , sizeof(steering)))
> +   ret = -EFAULT;
> +   break;
> +


Similar to my comment in patch 1/3: if only eBPF is used, these
calls can be avoided in favor of only TUNSETSTEERINGEBPF
from patch 3/3.


[PATCH net-next] liquidio: bump up driver version to 1.7.0 to match newer NIC firmware

2017-11-01 Thread Felix Manlunas
Signed-off-by: Felix Manlunas 
Acked-by: Derek Chickles 
---
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 3bcdda8..522dcc4 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -27,8 +27,8 @@
 
 #define LIQUIDIO_PACKAGE ""
 #define LIQUIDIO_BASE_MAJOR_VERSION 1
-#define LIQUIDIO_BASE_MINOR_VERSION 6
-#define LIQUIDIO_BASE_MICRO_VERSION 1
+#define LIQUIDIO_BASE_MINOR_VERSION 7
+#define LIQUIDIO_BASE_MICRO_VERSION 0
 #define LIQUIDIO_BASE_VERSION   __stringify(LIQUIDIO_BASE_MAJOR_VERSION) "." \
__stringify(LIQUIDIO_BASE_MINOR_VERSION)
 #define LIQUIDIO_MICRO_VERSION  "." __stringify(LIQUIDIO_BASE_MICRO_VERSION)


Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic

2017-11-01 Thread Willem de Bruijn
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang  wrote:
> tun now use flow caches based automatic queue steering method. This
> may not suffice all user cases. To extend it to be able to use more
> flow steering policy, this patch abstracts flow steering logic into
> tun_steering_ops, then we can declare and use different methods in
> the future.
> Signed-off-by: Jason Wang 
> ---
>  drivers/net/tun.c | 85 
> +--
>  1 file changed, 63 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ea29da9..bff6259 100644

The previous RFC enabled support for multiple pluggable steering
policies. But as all can be implemented in BPF and we only plan to
support an eBPF policy besides the legacy one, this patch is no longer
needed. We can save a few indirect function calls.


Re: [PATCH 1/7] dt-bindings: Add Cavium Octeon Common Ethernet Interface.

2017-11-01 Thread Florian Fainelli
On 11/01/2017 05:36 PM, David Daney wrote:
> From: Carlos Munoz 
> 
> Add bindings for Common Ethernet Interface (BGX) block.
> 
> Signed-off-by: Carlos Munoz 
> Signed-off-by: Steven J. Hill 
> Signed-off-by: David Daney 
> ---
[snip]
> +Properties:
> +
> +- compatible: "cavium,octeon-7360-xcv": Compatibility with cn73xx SOCs.
> +
> +- reg: The index of the interface within the BGX block.
> +
> +- local-mac-address: Mac address for the interface.
> +
> +- phy-handle: phandle to the phy node connected to the interface.
> +
> +- cavium,rx-clk-delay-bypass: Set to <1> to bypass the rx clock delay 
> setting.
> +  Needed by the Micrel PHY.

Is not that implied by an appropriate "phy-mode" property already?
-- 
Florian


Re: [PATCH 1/2] bpf: add a bpf_override_function helper

2017-11-01 Thread Daniel Borkmann

On 11/01/2017 06:00 PM, Josef Bacik wrote:

From: Josef Bacik 

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with its kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_function
helper.  This will modify the probed caller's return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Signed-off-by: Josef Bacik 


Looks useful (seccomp/BPF has something similar but on syscall level).
I think given we change kernel behavior, it would be good if we emit
similar warning like in case of bpf_get_probe_write_proto(), such that
from staring at the log (e.g. in case of subsequent crash), it could
help identifying that cause to trigger the bug could have been due to
bpf_override_function().


Re: [PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 17:10 -0700, Christoph Paasch wrote:

> Yes, that looks good to me. Thanks!
> 
> But we still need to clean up the skb in tcp_v4_send_reset and
> tcp_v4_send_ack, as the fields (e.g., tcp_skb_cb->seq) have been set when
> coming from tcp_v4_rcv.

You might be confused : ip_send_unicast_reply() does not send back the
incoming skb.

A fresh skb is allocated, then appended/sent.

And commit 24a2d43d8886f5a29c did the changes to provide to
__ip_options_echo() the proper IPCB header location.





Re: [PATCH net-next v15] openvswitch: enable NSH support

2017-11-01 Thread Pravin Shelar
On Tue, Oct 31, 2017 at 9:03 PM, Yi Yang  wrote:
> v14->v15
>  - Check size in nsh_hdr_from_nlattr
>  - Fixed four small issues pointed out By Jiri and Eric
>
> v13->v14
>  - Rename skb_push_nsh to nsh_push per Dave's comment
>  - Rename skb_pop_nsh to nsh_pop per Dave's comment
>
> v12->v13
>  - Fix NSH header length check in set_nsh
>
> v11->v12
>  - Fix missing changes old comments pointed out
>  - Fix new comments for v11
>
> v10->v11
>  - Fix the left three disputable comments for v9
>but not fixed in v10.
>
> v9->v10
>  - Change struct ovs_key_nsh to
>struct ovs_nsh_key_base base;
>__be32 context[NSH_MD1_CONTEXT_SIZE];
>  - Fix new comments for v9
>
> v8->v9
>  - Fix build error reported by daily intel build
>because nsh module isn't selected by openvswitch
>
> v7->v8
>  - Rework nested value and mask for OVS_KEY_ATTR_NSH
>  - Change pop_nsh to adapt to nsh kernel module
>  - Fix many issues per comments from Jiri Benc
>
> v6->v7
>  - Remove NSH GSO patches in v6 because Jiri Benc
>reworked it as another patch series and they have
>been merged.
>  - Change it to adapt to nsh kernel module added by NSH
>GSO patch series
>
> v5->v6
>  - Fix the rest comments for v4.
>  - Add NSH GSO support for VxLAN-gpe + NSH and
>Eth + NSH.
>
> v4->v5
>  - Fix many comments by Jiri Benc and Eric Garver
>for v4.
>
> v3->v4
>  - Add new NSH match field ttl
>  - Update NSH header to the latest format
>which will be final format and won't change
>per its author's confirmation.
>  - Fix comments for v3.
>
> v2->v3
>  - Change OVS_KEY_ATTR_NSH to nested key to handle
>length-fixed attributes and length-variable
>attriubte more flexibly.
>  - Remove struct ovs_action_push_nsh completely
>  - Add code to handle nested attribute for SET_MASKED
>  - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
>to transfer NSH header data.
>  - Fix comments and coding style issues by Jiri and Eric
>
> v1->v2
>  - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
>  - Dynamically allocate struct ovs_action_push_nsh for
>length-variable metadata.
>
> OVS master and 2.8 branch has merged NSH userspace
> patch series, this patch is to enable NSH support
> in kernel data path in order that OVS can support
> NSH in compat mode by porting this.
>
> Signed-off-by: Yi Yang 
> ---
I have comment related to checksum, otherwise patch looks good to me.

>  include/net/nsh.h|   3 +
>  include/uapi/linux/openvswitch.h |  29 
>  net/nsh/nsh.c|  59 
>  net/openvswitch/Kconfig  |   1 +
>  net/openvswitch/actions.c| 119 +++
>  net/openvswitch/flow.c   |  51 +++
>  net/openvswitch/flow.h   |   7 +
>  net/openvswitch/flow_netlink.c   | 315 
> ++-
>  net/openvswitch/flow_netlink.h   |   5 +
>  9 files changed, 588 insertions(+), 1 deletion(-)
>

...
> diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
> index 58fb827..2764682 100644
> --- a/net/nsh/nsh.c
> +++ b/net/nsh/nsh.c
> @@ -14,6 +14,65 @@
>  #include 
>  #include 
>
> +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh)
> +{
> +   struct nshhdr *nh;
> +   size_t length = nsh_hdr_len(pushed_nh);
> +   u8 next_proto;
> +
> +   if (skb->mac_len) {
> +   next_proto = TUN_P_ETHERNET;
> +   } else {
> +   next_proto = tun_p_from_eth_p(skb->protocol);
> +   if (!next_proto)
> +   return -EAFNOSUPPORT;
> +   }
> +
> +   /* Add the NSH header */
> +   if (skb_cow_head(skb, length) < 0)
> +   return -ENOMEM;
> +
> +   skb_push(skb, length);
> +   nh = (struct nshhdr *)(skb->data);
> +   memcpy(nh, pushed_nh, length);
> +   nh->np = next_proto;
dont you need to update checksum for CHECKSUM_COMPLETE case?

> +
> +   skb->protocol = htons(ETH_P_NSH);
> +   skb_reset_mac_header(skb);
> +   skb_reset_network_header(skb);
> +   skb_reset_mac_len(skb);
> +
> +   return 0;
> +}
> +EXPORT_SYMBOL_GPL(nsh_push);
> +
> +int nsh_pop(struct sk_buff *skb)
> +{
> +   struct nshhdr *nh;
> +   size_t length;
> +   __be16 inner_proto;
> +
> +   if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN))
> +   return -ENOMEM;
> +   nh = (struct nshhdr *)(skb->data);
> +   length = nsh_hdr_len(nh);
> +   inner_proto = tun_p_to_eth_p(nh->np);
> +   if (!pskb_may_pull(skb, length))
> +   return -ENOMEM;
> +
> +   if (!inner_proto)
> +   return -EAFNOSUPPORT;
> +
> +   skb_pull(skb, length);
same as above, we need to update the checksum.

> +   skb_reset_mac_header(skb);
> +   skb_reset_network_header(skb);
> +   skb_reset_mac_len(skb);
> +   skb->protocol = inner_proto;
> +
> +   return 0;
> +}
> +EXPORT_SYMBOL_GPL(nsh_pop);
> +
>  static struct sk_buff 

[PATCH 1/7] dt-bindings: Add Cavium Octeon Common Ethernet Interface.

2017-11-01 Thread David Daney
From: Carlos Munoz 

Add bindings for Common Ethernet Interface (BGX) block.

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---
 .../devicetree/bindings/net/cavium-bgx.txt | 59 ++
 1 file changed, 59 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt

diff --git a/Documentation/devicetree/bindings/net/cavium-bgx.txt 
b/Documentation/devicetree/bindings/net/cavium-bgx.txt
new file mode 100644
index ..9fb79f8bc17f
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/cavium-bgx.txt
@@ -0,0 +1,59 @@
+* Common Ethernet Interface (BGX) block
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx": Compatibility with all cn7xxx SOCs.
+
+- reg: The base address of the BGX block.
+
+- #address-cells: Must be <1>.
+
+- #size-cells: Must be <0>.  BGX addresses have no size component.
+
+Typically a BGX block has several children each representing an
+Ethernet interface.
+
+Example:
+
+   ethernet-mac-nexus@11800e000 {
+   compatible = "cavium,octeon-7890-bgx";
+   reg = <0x00011800 0xe000 0x 0x0100>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   ethernet-mac@0 {
+   ...
+   reg = <0>;
+   };
+   };
+
+
+* Ethernet Interface (BGX port) connects to PKI/PKO
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx-port": Compatibility with all cn7xxx
+  SOCs.
+
+- reg: The index of the interface within the BGX block.
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+
+* Ethernet Interface (BGX port) connects to XCV
+
+
+Properties:
+
+- compatible: "cavium,octeon-7360-xcv": Compatibility with cn73xx SOCs.
+
+- reg: The index of the interface within the BGX block.
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+- cavium,rx-clk-delay-bypass: Set to <1> to bypass the rx clock delay setting.
+  Needed by the Micrel PHY.
-- 
2.13.6



[PATCH 2/7] MIPS: Octeon: Enable LMTDMA/LMTST operations.

2017-11-01 Thread David Daney
From: Carlos Munoz 

LMTDMA/LMTST operations move data between cores and I/O devices:

* LMTST operations can send an address and a variable length
  (up to 128 bytes) of data to an I/O device.
* LMTDMA operations can send an address and a variable length
  (up to 128 bytes) of data to the I/O device and then return a
  variable length (up to 128 bytes) response from the I/O device.

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---
 arch/mips/cavium-octeon/setup.c   |  6 ++
 arch/mips/include/asm/octeon/octeon.h | 12 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index a8034d0dcade..99e6a68bc652 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -609,6 +609,12 @@ void octeon_user_io_init(void)
 #else
cvmmemctl.s.cvmsegenak = 0;
 #endif
+   if (OCTEON_IS_OCTEON3()) {
+   /* Enable LMTDMA */
+   cvmmemctl.s.lmtena = 1;
+   /* Scratch line to use for LMT operation */
+   cvmmemctl.s.lmtline = 2;
+   }
/* R/W If set, CVMSEG is available for loads/stores in
 * supervisor mode. */
cvmmemctl.s.cvmsegenas = 0;
diff --git a/arch/mips/include/asm/octeon/octeon.h 
b/arch/mips/include/asm/octeon/octeon.h
index c99c4b6a79f4..92a17d67c1fa 100644
--- a/arch/mips/include/asm/octeon/octeon.h
+++ b/arch/mips/include/asm/octeon/octeon.h
@@ -179,7 +179,15 @@ union octeon_cvmemctl {
/* RO 1 = BIST fail, 0 = BIST pass */
__BITFIELD_FIELD(uint64_t wbfbist:1,
/* Reserved */
-   __BITFIELD_FIELD(uint64_t reserved:17,
+   __BITFIELD_FIELD(uint64_t reserved_52_57:6,
+   /* When set, LMTDMA/LMTST operations are permitted */
+   __BITFIELD_FIELD(uint64_t lmtena:1,
+   /* Selects the CVMSEG LM cacheline used by LMTDMA
+* LMTST and wide atomic store operations.
+*/
+   __BITFIELD_FIELD(uint64_t lmtline:6,
+   /* Reserved */
+   __BITFIELD_FIELD(uint64_t reserved_41_44:4,
/* OCTEON II - TLB replacement policy: 0 = bitmask LRU; 1 = NLU.
 * This field selects between the TLB replacement policies:
 * bitmask LRU or NLU. Bitmask LRU maintains a mask of
@@ -275,7 +283,7 @@ union octeon_cvmemctl {
/* R/W Size of local memory in cache blocks, 54 (6912
 * bytes) is max legal value. */
__BITFIELD_FIELD(uint64_t lmemsz:6,
-   ;)
+   ;
} s;
 };
 
-- 
2.13.6



[PATCH 4/7] MIPS: Octeon: Add Free Pointer Unit (FPA) support.

2017-11-01 Thread David Daney
From: Carlos Munoz 

From the hardware user manual: "The FPA is a unit that maintains
pools of pointers to free L2/DRAM memory. To provide QoS, the pools
are referenced indirectly through 1024 auras. Both core software
and hardware units allocate and free pointers."

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---
 arch/mips/cavium-octeon/Kconfig   |  10 +
 arch/mips/cavium-octeon/Makefile  |   1 +
 arch/mips/cavium-octeon/octeon-fpa3.c | 363 ++
 arch/mips/include/asm/octeon/octeon.h |  15 ++
 4 files changed, 389 insertions(+)
 create mode 100644 arch/mips/cavium-octeon/octeon-fpa3.c

diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 5c0b56203bae..211ef5b57214 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -86,4 +86,14 @@ config OCTEON_ILM
  To compile this driver as a module, choose M here.  The module
  will be called octeon-ilm
 
+config OCTEON_FPA3
+   tristate "Octeon III fpa driver"
+   default "n"
+   depends on CPU_CAVIUM_OCTEON
+   help
+ This option enables a Octeon III driver for the Free Pool Unit (FPA).
+ The FPA is a hardware unit that manages pools of pointers to free
+ L2/DRAM memory. This driver provides an interface to reserve,
+ initialize, and fill fpa pools.
+
 endif # CAVIUM_OCTEON_SOC
diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile
index 0a299ab8719f..0ef967399702 100644
--- a/arch/mips/cavium-octeon/Makefile
+++ b/arch/mips/cavium-octeon/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_MTD)   += flash_setup.o
 obj-$(CONFIG_SMP)+= smp.o
 obj-$(CONFIG_OCTEON_ILM) += oct_ilm.o
 obj-$(CONFIG_USB)+= octeon-usb.o
+obj-$(CONFIG_OCTEON_FPA3)+= octeon-fpa3.o
diff --git a/arch/mips/cavium-octeon/octeon-fpa3.c 
b/arch/mips/cavium-octeon/octeon-fpa3.c
new file mode 100644
index ..65e8081b6a3b
--- /dev/null
+++ b/arch/mips/cavium-octeon/octeon-fpa3.c
@@ -0,0 +1,363 @@
+/*
+ * Driver for the Octeon III Free Pool Unit (fpa).
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2015-2017 Cavium, Inc.
+ */
+
+#include 
+
+#include 
+
+
+/* Registers are accessed via xkphys */
+#define SET_XKPHYS (1ull << 63)
+#define NODE_OFFSET0x10ull
+#define SET_NODE(node) ((node) * NODE_OFFSET)
+
+#define FPA_BASE   0x12800ull
+#define SET_FPA_BASE(node) (SET_XKPHYS + SET_NODE(node) + FPA_BASE)
+
+#define FPA_GEN_CFG(n) (SET_FPA_BASE(n)   + 0x0050)
+
+#define FPA_POOLX_CFG(n, p)(SET_FPA_BASE(n) + (p<<3)  + 0x1000)
+#define FPA_POOLX_START_ADDR(n, p) (SET_FPA_BASE(n) + (p<<3)  + 0x1050)
+#define FPA_POOLX_END_ADDR(n, p)   (SET_FPA_BASE(n) + (p<<3)  + 0x1060)
+#define FPA_POOLX_STACK_BASE(n, p) (SET_FPA_BASE(n) + (p<<3)  + 0x1070)
+#define FPA_POOLX_STACK_END(n, p)  (SET_FPA_BASE(n) + (p<<3)  + 0x1080)
+#define FPA_POOLX_STACK_ADDR(n, p) (SET_FPA_BASE(n) + (p<<3)  + 0x1090)
+
+#define FPA_AURAX_POOL(n, a)   (SET_FPA_BASE(n) + (a<<3)  + 0x2000)
+#define FPA_AURAX_CFG(n, a)(SET_FPA_BASE(n) + (a<<3)  + 0x2010)
+#define FPA_AURAX_CNT(n, a)(SET_FPA_BASE(n) + (a<<3)  + 0x2020)
+#define FPA_AURAX_CNT_LIMIT(n, a)  (SET_FPA_BASE(n) + (a<<3)  + 0x2040)
+#define FPA_AURAX_CNT_THRESHOLD(n, a)  (SET_FPA_BASE(n) + (a<<3)  + 0x2050)
+#define FPA_AURAX_POOL_LEVELS(n, a)(SET_FPA_BASE(n) + (a<<3)  + 0x2070)
+#define FPA_AURAX_CNT_LEVELS(n, a) (SET_FPA_BASE(n) + (a<<3)  + 0x2080)
+
+static inline u64 oct_csr_read(u64 addr)
+{
+   return __raw_readq((void __iomem *)addr);
+}
+
+static inline void oct_csr_write(u64 data, u64 addr)
+{
+   __raw_writeq(data, (void __iomem *)addr);
+}
+
+static DEFINE_MUTEX(octeon_fpa3_lock);
+
+static int get_num_pools(void)
+{
+   if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+   return 64;
+   if (OCTEON_IS_MODEL(OCTEON_CNF75XX) || OCTEON_IS_MODEL(OCTEON_CN73XX))
+   return 32;
+   return 0;
+}
+
+static int get_num_auras(void)
+{
+   if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+   return 1024;
+   if (OCTEON_IS_MODEL(OCTEON_CNF75XX) || OCTEON_IS_MODEL(OCTEON_CN73XX))
+   return 512;
+   return 0;
+}
+
+/**
+ * octeon_fpa3_init - Initialize the fpa to default values.
+ * @node: Node of fpa to initialize.
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+int octeon_fpa3_init(int node)
+{
+   

[PATCH 3/7] MIPS: Octeon: Add a global resource manager.

2017-11-01 Thread David Daney
From: Carlos Munoz 

Add a global resource manager to manage tagged pointers within
bootmem allocated memory. This is used by various functional
blocks in the Octeon core like the FPA, Ethernet nexus, etc.

Signed-off-by: Carlos Munoz 
Signed-off-by: Steven J. Hill 
Signed-off-by: David Daney 
---
 arch/mips/cavium-octeon/Makefile   |   3 +-
 arch/mips/cavium-octeon/resource-mgr.c | 362 +
 arch/mips/include/asm/octeon/octeon.h  |  18 ++
 3 files changed, 382 insertions(+), 1 deletion(-)
 create mode 100644 arch/mips/cavium-octeon/resource-mgr.c

diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile
index 7c02e542959a..0a299ab8719f 100644
--- a/arch/mips/cavium-octeon/Makefile
+++ b/arch/mips/cavium-octeon/Makefile
@@ -9,7 +9,8 @@
 # Copyright (C) 2005-2009 Cavium Networks
 #
 
-obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o
+obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o \
+resource-mgr.o
 obj-y += dma-octeon.o
 obj-y += octeon-memcpy.o
 obj-y += executive/
diff --git a/arch/mips/cavium-octeon/resource-mgr.c 
b/arch/mips/cavium-octeon/resource-mgr.c
new file mode 100644
index ..d31b72d56c31
--- /dev/null
+++ b/arch/mips/cavium-octeon/resource-mgr.c
@@ -0,0 +1,362 @@
+/*
+ * Resource manager for Octeon.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2017 Cavium, Inc.
+ */
+#include 
+
+#include 
+#include 
+
+#define RESOURCE_MGR_BLOCK_NAME"cvmx-global-resources"
+#define MAX_RESOURCES  128
+#define INST_AVAILABLE -88
+#define OWNER  0xbadc0de
+
+struct global_resource_entry {
+   struct global_resource_tag tag;
+   u64 phys_addr;
+   u64 size;
+};
+
+struct global_resources {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+   u32 rlock;
+   u32 pad;
+#else
+   u32 pad;
+   u32 rlock;
+#endif
+   u64 entry_cnt;
+   struct global_resource_entry resource_entry[];
+};
+
+static struct global_resources *res_mgr_info;
+
+
+static void res_mgr_lock(void)
+{
+   unsigned int tmp;
+   u64 lock = (u64)&res_mgr_info->rlock;
+
+   __asm__ __volatile__(
+   ".set noreorder\n"
+   "1: ll   %[tmp], 0(%[addr])\n"
+   "   bnez %[tmp], 1b\n"
+   "   li   %[tmp], 1\n"
+   "   sc   %[tmp], 0(%[addr])\n"
+   "   beqz %[tmp], 1b\n"
+   "   nop\n"
+   ".set reorder\n" :
   [tmp] "=&r"(tmp) :
+   [addr] "r"(lock) :
+   "memory");
+}
+
+static void res_mgr_unlock(void)
+{
+   u64 lock = (u64)&res_mgr_info->rlock;
+
+   /* Wait until all resource operations finish before unlocking. */
+   mb();
+   __asm__ __volatile__(
+   "sw $0, 0(%[addr])\n" : :
+   [addr] "r"(lock) :
+   "memory");
+
+   /* Force a write buffer flush. */
+   mb();
+}
+
+static int res_mgr_find_resource(struct global_resource_tag tag)
+{
+   struct global_resource_entry *res_entry;
+   int i;
+
+   for (i = 0; i < res_mgr_info->entry_cnt; i++) {
+   res_entry = &res_mgr_info->resource_entry[i];
+   if (res_entry->tag.lo == tag.lo && res_entry->tag.hi == tag.hi)
+   return i;
+   }
+   return -1;
+}
+
+/**
+ * res_mgr_create_resource - Create a resource.
+ * @tag: Identifies the resource.
+ * @inst_cnt: Number of resource instances to create.
+ *
+ * Returns 0 if the resource was created successfully.
+ * Returns <0 for error codes.
+ */
+int res_mgr_create_resource(struct global_resource_tag tag, int inst_cnt)
+{
+   struct global_resource_entry *res_entry;
+   u64 size;
+   u64 *res_addr;
+   int res_index, i, rc = 0;
+
+   res_mgr_lock();
+
+   /* Make sure resource doesn't already exist. */
+   res_index = res_mgr_find_resource(tag);
+   if (res_index >= 0) {
+   rc = -1;
+   goto err;
+   }
+
+   if (res_mgr_info->entry_cnt >= MAX_RESOURCES) {
+   pr_err("Resource max limit reached, not created\n");
+   rc = -1;
+   goto err;
+   }
+
+   /*
+* Each instance is kept in an array of u64s. The first array element
+* holds the number of allocated instances.
+*/
+   size = sizeof(u64) * (inst_cnt + 1);
+   res_addr = cvmx_bootmem_alloc_range(size, CVMX_CACHE_LINE_SIZE, 0, 0);
+   if (!res_addr) {
+   pr_err("Failed to allocate resource. not created\n");
+   rc = -1;
+   goto err;
+   }
+
+   /* Initialize the newly created resource. */
+   *res_addr = inst_cnt;
+   for 

[PATCH 5/7] MIPS: Octeon: Automatically provision CVMSEG space.

2017-11-01 Thread David Daney
Remove CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE and automatically calculate
the amount of CVMSEG space needed.

1st 128-bytes: Use by IOBDMA
2nd 128-bytes: Reserved by kernel for scratch/TLS emulation.
3rd 128-bytes: OCTEON-III LMTLINE

New config variable CONFIG_CAVIUM_OCTEON_EXTRA_CVMSEG provisions
additional lines, defaults to zero.

Signed-off-by: David Daney 
Signed-off-by: Carlos Munoz 
---
 arch/mips/cavium-octeon/Kconfig| 27 
 arch/mips/cavium-octeon/setup.c| 16 ++--
 .../asm/mach-cavium-octeon/kernel-entry-init.h | 20 +--
 arch/mips/include/asm/mipsregs.h   |  2 ++
 arch/mips/include/asm/octeon/octeon.h  |  2 ++
 arch/mips/include/asm/processor.h  |  2 +-
 arch/mips/kernel/octeon_switch.S   |  2 --
 arch/mips/kernel/unaligned.c   |  3 +++
 arch/mips/mm/tlbex.c   | 29 ++
 9 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 211ef5b57214..fc6a1b44605b 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -10,21 +10,26 @@ config CAVIUM_CN63XXP1
  non-CN63XXP1 hardware, so it is recommended to select "n"
  unless it is known the workarounds are needed.
 
-config CAVIUM_OCTEON_CVMSEG_SIZE
-   int "Number of L1 cache lines reserved for CVMSEG memory"
-   range 0 54
-   default 1
-   help
- CVMSEG LM is a segment that accesses portions of the dcache as a
- local memory; the larger CVMSEG is, the smaller the cache is.
- This selects the size of CVMSEG LM, which is in cache blocks. The
- legally range is from zero to 54 cache blocks (i.e. CVMSEG LM is
- between zero and 6192 bytes).
-
 endif # CPU_CAVIUM_OCTEON
 
 if CAVIUM_OCTEON_SOC
 
+config CAVIUM_OCTEON_EXTRA_CVMSEG
+   int "Number of extra L1 cache lines reserved for CVMSEG memory"
+   range 0 50
+   default 0
+   help
+ CVMSEG LM is a segment that accesses portions of the dcache
+ as a local memory; the larger CVMSEG is, the smaller the
+ cache is.  The kernel uses two or three blocks (one for TLB
+ exception handlers, one for driver IOBDMA operations, and on
+ models that need it, one for LMTDMA operations). This
+ selects an optional extra number of CVMSEG lines for use by
+ other software.
+
+ Normally no extra lines are required, and this parameter
+ should be set to zero.
+
 config CAVIUM_OCTEON_LOCK_L2
bool "Lock often used kernel code in the L2"
default "y"
diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index 99e6a68bc652..51c4d3c3cada 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -68,6 +68,12 @@ extern void pci_console_init(const char *arg);
 static unsigned long long max_memory = ULLONG_MAX;
 static unsigned long long reserve_low_mem;
 
+/*
+ * modified in kernel-entry-init.h, must have an initial value to keep
+ * it from being clobbered when bss is zeroed.
+ */
+u32 octeon_cvmseg_lines = 2;
+
 DEFINE_SEMAPHORE(octeon_bootbus_sem);
 EXPORT_SYMBOL(octeon_bootbus_sem);
 
@@ -604,11 +610,7 @@ void octeon_user_io_init(void)
 
/* R/W If set, CVMSEG is available for loads/stores in
 * kernel/debug mode. */
-#if CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE > 0
cvmmemctl.s.cvmsegenak = 1;
-#else
-   cvmmemctl.s.cvmsegenak = 0;
-#endif
if (OCTEON_IS_OCTEON3()) {
/* Enable LMTDMA */
cvmmemctl.s.lmtena = 1;
@@ -626,9 +628,9 @@ void octeon_user_io_init(void)
 
/* Setup of CVMSEG is done in kernel-entry-init.h */
if (smp_processor_id() == 0)
-   pr_notice("CVMSEG size: %d cache lines (%d bytes)\n",
- CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE,
- CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE * 128);
+   pr_notice("CVMSEG size: %u cache lines (%u bytes)\n",
+ octeon_cvmseg_lines,
+ octeon_cvmseg_lines * 128);
 
if (octeon_has_feature(OCTEON_FEATURE_FAU)) {
union cvmx_iob_fau_timeout fau_timeout;
diff --git a/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h 
b/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h
index c38b38ce5a3d..cdcca60978a2 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h
@@ -26,11 +26,18 @@
# a3 = address of boot descriptor block
.set push
.set arch=octeon
+   mfc0v1, CP0_PRID_REG
+   andiv1, 0xff00
+   li  v0, 0x9500  # cn78XX or later
+   subuv1, v1, v0
+

[PATCH 7/7] MAINTAINERS: Add entry for drivers/net/ethernet/cavium/octeon/octeon3-*

2017-11-01 Thread David Daney
Signed-off-by: David Daney 
---
 MAINTAINERS | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9a24f56e0451..142af33adc35 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3212,6 +3212,12 @@ W:   http://www.cavium.com
 S: Supported
 F: drivers/mmc/host/cavium*
 
+CAVIUM OCTEON-III NETWORK DRIVER
+M: David Daney 
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/ethernet/cavium/octeon/octeon3-*
+
 CAVIUM OCTEON-TX CRYPTO DRIVER
 M: George Cherian 
 L: linux-cry...@vger.kernel.org
-- 
2.13.6



[PATCH 0/7] Cavium OCTEON-III network driver.

2017-11-01 Thread David Daney
We are adding the Cavium OCTEON-III network driver.  But since
interacting with the input and output queues is done via special CPU
local memory, we also need to add support to the MIPS/Octeon
architecture code.  Aren't SoCs nice in this way?

The first five patches add the SoC support needed by the driver, the
last two add the driver and an entry in MAINTAINERS.

Since these touch several subsystems, I would propose merging via
netdev, but defer to the maintainers if they think something else
would work better.

A separate pull request was recently done by Steven Hill for the
firmware required by the driver.

Carlos Munoz (5):
  dt-bindings: Add Cavium Octeon Common Ethernet Interface.
  MIPS: Octeon: Enable LMTDMA/LMTST operations.
  MIPS: Octeon: Add a global resource manager.
  MIPS: Octeon: Add Free Pointer Unit (FPA) support.
  netdev: octeon-ethernet: Add Cavium Octeon III support.

David Daney (2):
  MIPS: Octeon: Automatically provision CVMSEG space.
  MAINTAINERS: Add entry for
drivers/net/ethernet/cavium/octeon/octeon3-*

 .../devicetree/bindings/net/cavium-bgx.txt |   59 +
 MAINTAINERS|6 +
 arch/mips/cavium-octeon/Kconfig|   37 +-
 arch/mips/cavium-octeon/Makefile   |4 +-
 arch/mips/cavium-octeon/octeon-fpa3.c  |  363 
 arch/mips/cavium-octeon/resource-mgr.c |  362 
 arch/mips/cavium-octeon/setup.c|   22 +-
 .../asm/mach-cavium-octeon/kernel-entry-init.h |   20 +-
 arch/mips/include/asm/mipsregs.h   |2 +
 arch/mips/include/asm/octeon/octeon.h  |   47 +-
 arch/mips/include/asm/processor.h  |2 +-
 arch/mips/kernel/octeon_switch.S   |2 -
 arch/mips/kernel/unaligned.c   |3 +
 arch/mips/mm/tlbex.c   |   29 +-
 drivers/net/ethernet/cavium/Kconfig|   28 +-
 drivers/net/ethernet/cavium/octeon/Makefile|6 +
 .../net/ethernet/cavium/octeon/octeon3-bgx-nexus.c |  698 +++
 .../net/ethernet/cavium/octeon/octeon3-bgx-port.c  | 2023 +++
 drivers/net/ethernet/cavium/octeon/octeon3-core.c  | 2075 
 drivers/net/ethernet/cavium/octeon/octeon3-pki.c   |  833 
 drivers/net/ethernet/cavium/octeon/octeon3-pko.c   | 1719 
 drivers/net/ethernet/cavium/octeon/octeon3-sso.c   |  309 +++
 drivers/net/ethernet/cavium/octeon/octeon3.h   |  411 
 23 files changed, 9005 insertions(+), 55 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt
 create mode 100644 arch/mips/cavium-octeon/octeon-fpa3.c
 create mode 100644 arch/mips/cavium-octeon/resource-mgr.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-core.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pki.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pko.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-sso.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3.h

-- 
2.13.6



Re: Bond recovery from BOND_LINK_FAIL state not working

2017-11-01 Thread Jay Vosburgh
Jay Vosburgh  wrote:

>Alex Sidorenko  wrote:
>
>>The problem has been found while trying to deploy RHEL7 on HPE Synergy
>>platform, it is seen both in customer's environment and in HPE test lab.
>>
>>There are several bonds configured in TLB mode and miimon=100, all other
>>options are default. Slaves are connected to VirtualConnect
>>modules. Rebooting a VC module should bring one bond slave (ens3f0) down
>>temporarily, but not another one (ens3f1). But what we see is
>>
>>Oct 24 10:37:12 SYDC1LNX kernel: bond0: link status up again after 0 ms for 
>>interface ens3f1

In net-next, I don't see a path in the code that will lead to
this message, as it would apparently require entering
bond_miimon_inspect in state BOND_LINK_FAIL but with downdelay set to 0.
If downdelay is 0, the code will transition to BOND_LINK_DOWN and not
remain in _FAIL state.

>>and it never recovers. When VC reboot is complete, everything goes back to 
>>normal again.
>>
>>Redhat has backported all recent upstream commits and instrumented the
>>bonding driver. We have found the following (when VC goes down)
>>
>>In bond_miimon_inspect() the first slave goes to
>>  bond_propose_link_state(slave, BOND_LINK_FAIL);
>>  and
>>  slave->new_link = BOND_LINK_DOWN;
>>
>>The second slave is still
>>  slave->link = BOND_LINK_UP;
>>  and
>>slave->new_link = BOND_LINK_NOCHANGE;
>>
>>This is as expected. But in bond_miimon_commit() we see that _both_ slaves
>>are in BOND_LINK_FAIL.  That is, something changes the state of the second
>>slave from another thread. We suspect the NetworkManager, as the problem
>>is there _only_ when bonds are controlled by it, if we set
>>NM_CONTROLLED=no everything starts working normally.
>>
>>While we still do not understand how NM affects bond state, I think that
>>bonding driver needs to be made reliable enough to recover even from this
>>state.
>
>   You're suggesting that the bonding driver be able to distinguish
>"false" down states set by network manager from a genuine link failure?
>Am I misunderstanding?
>
>>At this moment when we enter bond_miimon_inspect() with
>>slave->link = BOND_LINK_FAIL and are in the following code
>>
>>/*FALLTHRU*/
>>case BOND_LINK_BACK:
>>if (!link_state) {
>>bond_propose_link_state(slave, 
>> BOND_LINK_DOWN);
>>netdev_info(bond->dev, "link status down
>>again after %d ms for interface %s\n",
>>(bond->params.updelay - 
>> slave->delay) *
>>bond->params.miimon,
>>slave->dev->name);
>>
>>commit++;
>>continue;
>>}
>>
>
>   Looking at bonding in the current net-next, if a slave enters
>bond_miimon_inspect with slave->link == BOND_LINK_FAIL, it will not
>proceed to the BOND_LINK_BACK block of the switch; BOND_LINK_FAIL does
>not fall through that far.
>
>   Did you actually mean the BOND_LINK_FAIL block of the switch?
>I'll assume so for the rest of my reply.
>
>>we propose a new state and do 'commit++', but we do not change
>>slave->new_link from BOND_LINK_NOCHANGE. As a result, bond_miimon_commit()
>>will not process this slave.
>>
>>The following patch fixes the issue:
>>
>>
>>If we enter bond_miimon_inspect() with slave_link=BOND_LINK_FAIL
>>and recover, we do bond_propose_link_state(slave, BOND_LINK_UP);
>>but do not change slave->new_link, so it is left in
>>BOND_LINK_NOCHANGE. As a result, bond_miimon_commit() will not
>>process that slave and it never recovers. We need to set
>>slave->new_link = BOND_LINK_UP to make bond_miimon_commit() work
>
>   What is your downdelay set to?  In principle,
>bond_miimon_inspect should not enter with a slave in BOND_LINK_FAIL
>state if downdelay is 0.
>
>> drivers/net/bonding/bond_main.c | 1 +
>> 1 file changed, 1 insertion(+)
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index c99dc59..07aa7ba 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -2072,6 +2072,7 @@ static int bond_miimon_inspect(struct bonding *bond)
>>(bond->params.downdelay - 
>> slave->delay) *
>>bond->params.miimon,
>>slave->dev->name);
>>+   slave->new_link = BOND_LINK_UP;
>>commit++;
>>continue;
>>}
>>-- 
>>2.7.4
>
>   Your patch does not apply to net-next, so I'm not absolutely
>sure where this is, but presuming that this is in the BOND_LINK_FAIL
>case of the switch, it looks 

Re: [PATCH v2] net: phy: leds: Add support for "link" trigger

2017-11-01 Thread Andrew Lunn
On Thu, Nov 02, 2017 at 12:49:31AM +0100, Maciej S. Szmigiero wrote:
> Hi Andrew,
> 
> On 01.11.2017 13:33, Maciej S. Szmigiero wrote:
> > On 01.11.2017 13:31, Andrew Lunn wrote:
> >>> Yes, I did it the same way as the existing code did for 
> >>> phy->phy_led_triggers
> >>> for reasons of both consistency and also to be on the safe side because
> >>> maybe there is some non-obvious reason why it has to be freed
> >>> explicitly (?).
> >>  
> >> Hi Maciej
> >>
> >> Occasionally, there is a need to call devm_kfree(). But i don't see
> >> anything here why it is needed. So i would drop your devm_kfree(), and
> >> if you feel like it, add an additional patch removing the existing
> >> one.
> > 
> > OK, will do as you suggest.
> > 
> 
> Upon closer inspection of the code it turned out that these devm_kfree()
> calls actually had some purpose - the PHY core ignores the return value
> of phy_led_triggers_register() and will successfully attach a PHY even
> if this function returns an error.
> 
> In that case these LED trigger structures would unnecessary take some
> memory, that's why (probably) the PHY LED code frees them on error
> path.

O.K. Thanks for looking into this.

 Andrew


Re: [PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Christoph Paasch
On 01/11/17 - 14:53:38, Eric Dumazet wrote:
> On Wed, 2017-11-01 at 14:32 -0700, Eric Dumazet wrote:
> > On Wed, Nov 1, 2017 at 2:10 PM, Christoph Paasch  wrote:
> > > Since commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache
> > > line misses") IPCB/IP6CB is no more at the beginning of the tcp_skb_cb.
> > > This means that on the output path, we need to make sure that it has
> > > been correctly initialized to 0, as is done in tcp_transmit_skb.
> > >
> > > However, when going through the other code-path in TCP that can send an
> > > skb (e.g., through tcp_v6_send_synack), we end up in a situation where
> > > IP6CB has some of its fields set to unexpected values. Depending on the
> > > layout of tcp_skb_cb across the different kernel-versions this can be
> > > lastopt, flags,...
> > 
> > Or not use tcp_init_nondata_skb() on non fast clones, since it adds
> > unnecessary writes and clears.
> > 
> > tcp_make_synack() really has no business using tcp_init_nondata_skb()
> > and could simply set th->seq = htonl(tcp_rsk(req)->snt_isn);
> 
> Something like :
> 
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 
> 69cfdead0cb49e4365158048a0d1a9bbdd55fa83..5502abc5307f0ce1de610d4b70f3a59c4d5383c5
>  100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3399,13 +3399,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, 
> struct dst_entry *dst,
> tcp_ecn_make_synack(req, th);
> th->source = htons(ireq->ir_num);
> th->dest = ireq->ir_rmt_port;
> -   /* Setting of flags are superfluous here for callers (and ECE is
> -* not even correctly set)
> -*/
> -   tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
> -TCPHDR_SYN | TCPHDR_ACK);
> -
> -   th->seq = htonl(TCP_SKB_CB(skb)->seq);
> +   skb->ip_summed = CHECKSUM_PARTIAL;
> +   th->seq = htonl(tcp_rsk(req)->snt_isn);
> /* XXX data is queued and acked as is. No buffer/window check */
> th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

Yes, that looks good to me. Thanks!

But we still need to clean up the skb in tcp_v4_send_reset and
tcp_v4_send_ack, as the fields (e.g., tcp_skb_cb->seq) have been set when
coming from tcp_v4_rcv.


Christoph



[PATCH v3 2/2] net: phy: leds: Add support for "link" trigger

2017-11-01 Thread Maciej S. Szmigiero
Currently, we create a LED trigger for any link speed known to a PHY.
These triggers only fire when their exact link speed had been negotiated
(they aren't cumulative, that is, they don't fire for "their or any higher"
link speed).

What we are missing, however, is a trigger which will fire on any link
speed known to the PHY. Such trigger can then be used for implementing a
poor man's substitute of the "link" LED on boards that lack it.
Let's add it.

Signed-off-by: Maciej S. Szmigiero 
---
Changes from v1: Don't keep the "link" trigger together with link speed
triggers in one array so we don't have to overload a SPEED_UNKNOWN speed
to mean the "link" trigger

Changes from v2: Add refactoring of "no link" handler to the patch
series, set led_link_trigger to NULL on error path because the PHY core
ignores the return value of phy_led_triggers_register() and will call
phy_led_triggers_unregister() even though the registration had failed.

 drivers/net/phy/Kconfig|  7 +--
 drivers/net/phy/phy_led_triggers.c | 43 +++---
 include/linux/phy.h|  2 ++
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index cd931cf9dcc2..3bcc2107ad77 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -191,11 +191,14 @@ config LED_TRIGGER_PHY
  Adds support for a set of LED trigger events per-PHY.  Link
  state change will trigger the events, for consumption by an
  LED class driver.  There are triggers for each link speed currently
- supported by the phy, and are of the form:
+ supported by the PHY and also a one common "link" trigger as a
+ logical-or of all the link speed ones.
+ All these triggers are named according to the following pattern:
  ::
 
  Where speed is in the form:
-   Mbps or Gbps
+   Mbps OR Gbps OR link
+   for any speed known to the PHY.
 
 
 comment "MII PHY device drivers"
diff --git a/drivers/net/phy/phy_led_triggers.c 
b/drivers/net/phy/phy_led_triggers.c
index c736f29b3b2a..39ecad25b201 100644
--- a/drivers/net/phy/phy_led_triggers.c
+++ b/drivers/net/phy/phy_led_triggers.c
@@ -31,6 +31,7 @@ static void phy_led_trigger_no_link(struct phy_device *phy)
 {
if (phy->last_triggered) {
   led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
+   led_trigger_event(&phy->led_link_trigger->trigger, LED_OFF);
phy->last_triggered = NULL;
}
 }
@@ -54,6 +55,10 @@ void phy_led_trigger_change_speed(struct phy_device *phy)
}
 
if (plt != phy->last_triggered) {
+   if (!phy->last_triggered)
+   led_trigger_event(&phy->led_link_trigger->trigger,
+ LED_FULL);
+
   led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
   led_trigger_event(&plt->trigger, LED_FULL);
phy->last_triggered = plt;
@@ -61,6 +66,13 @@ void phy_led_trigger_change_speed(struct phy_device *phy)
 }
 EXPORT_SYMBOL_GPL(phy_led_trigger_change_speed);
 
+static void phy_led_trigger_format_name(struct phy_device *phy, char *buf,
+   size_t size, char *suffix)
+{
+   snprintf(buf, size, PHY_ID_FMT ":%s",
+phy->mdio.bus->id, phy->mdio.addr, suffix);
+}
+
 static int phy_led_trigger_register(struct phy_device *phy,
struct phy_led_trigger *plt,
unsigned int speed)
@@ -77,8 +89,8 @@ static int phy_led_trigger_register(struct phy_device *phy,
snprintf(name_suffix, sizeof(name_suffix), "%dGbps",
 DIV_ROUND_CLOSEST(speed, 1000));
 
-   snprintf(plt->name, sizeof(plt->name), PHY_ID_FMT ":%s",
-phy->mdio.bus->id, phy->mdio.addr, name_suffix);
+   phy_led_trigger_format_name(phy, plt->name, sizeof(plt->name),
+   name_suffix);
plt->trigger.name = plt->name;
 
   return led_trigger_register(&plt->trigger);
@@ -99,13 +111,30 @@ int phy_led_triggers_register(struct phy_device *phy)
if (!phy->phy_num_led_triggers)
return 0;
 
+   phy->led_link_trigger = devm_kzalloc(&phy->mdio.dev,
+sizeof(*phy->led_link_trigger),
+GFP_KERNEL);
+   if (!phy->led_link_trigger) {
+   err = -ENOMEM;
+   goto out_clear;
+   }
+
+   phy_led_trigger_format_name(phy, phy->led_link_trigger->name,
+   sizeof(phy->led_link_trigger->name),
+   "link");
+   phy->led_link_trigger->trigger.name = phy->led_link_trigger->name;
+
+   err = led_trigger_register(&phy->led_link_trigger->trigger);
+   if (err)
+ 

Re: [PATCH v2] net: phy: leds: Add support for "link" trigger

2017-11-01 Thread Maciej S. Szmigiero
Hi Andrew,

On 01.11.2017 13:33, Maciej S. Szmigiero wrote:
> On 01.11.2017 13:31, Andrew Lunn wrote:
>>> Yes, I did it the same way as the existing code did for 
>>> phy->phy_led_triggers
>>> for reasons of both consistency and also to be on the safe side because
>>> maybe there is some non-obvious reason why it has to be freed
>>> explicitly (?).
>>  
>> Hi Maciej
>>
>> Occasionally, there is a need to call devm_kfree(). But i don't see
>> anything here why it is needed. So i would drop your devm_kfree(), and
>> if you feel like it, add an additional patch removing the existing
>> one.
> 
> OK, will do as you suggest.
> 

Upon closer inspection of the code it turned out that these devm_kfree()
calls actually had some purpose - the PHY core ignores the return value
of phy_led_triggers_register() and will successfully attach a PHY even
if this function returns an error.

In that case these LED trigger structures would unnecessary take some
memory, that's why (probably) the PHY LED code frees them on error
path.

Due to this I've decided to keep these calls to devm_kfree().

Maciej


[PATCH v3 1/2] net: phy: leds: Refactor "no link" handler into a separate function

2017-11-01 Thread Maciej S. Szmigiero
Currently, phy_led_trigger_change_speed() is handling a "no link" condition
like it was some kind of an error (using "goto" to a code at the function
end).

However, having no link at PHY is an ordinary operational state, so let's
handle it in an appropriately named separate function so it is more obvious
what the code is doing.

Signed-off-by: Maciej S. Szmigiero 
---
 drivers/net/phy/phy_led_triggers.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/phy_led_triggers.c 
b/drivers/net/phy/phy_led_triggers.c
index 94ca42e630bb..c736f29b3b2a 100644
--- a/drivers/net/phy/phy_led_triggers.c
+++ b/drivers/net/phy/phy_led_triggers.c
@@ -27,12 +27,20 @@ static struct phy_led_trigger 
*phy_speed_to_led_trigger(struct phy_device *phy,
return NULL;
 }
 
+static void phy_led_trigger_no_link(struct phy_device *phy)
+{
+   if (phy->last_triggered) {
+   led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
+   phy->last_triggered = NULL;
+   }
+}
+
 void phy_led_trigger_change_speed(struct phy_device *phy)
 {
struct phy_led_trigger *plt;
 
if (!phy->link)
-   goto out_change_speed;
+   return phy_led_trigger_no_link(phy);
 
if (phy->speed == 0)
return;
@@ -42,7 +50,7 @@ void phy_led_trigger_change_speed(struct phy_device *phy)
netdev_alert(phy->attached_dev,
 "No phy led trigger registered for speed(%d)\n",
 phy->speed);
-   goto out_change_speed;
+   return phy_led_trigger_no_link(phy);
}
 
if (plt != phy->last_triggered) {
@@ -50,14 +58,6 @@ void phy_led_trigger_change_speed(struct phy_device *phy)
led_trigger_event(>trigger, LED_FULL);
phy->last_triggered = plt;
}
-   return;
-
-out_change_speed:
-   if (phy->last_triggered) {
-   led_trigger_event(&phy->last_triggered->trigger,
- LED_OFF);
-   phy->last_triggered = NULL;
-   }
 }
 EXPORT_SYMBOL_GPL(phy_led_trigger_change_speed);
 



[PATCH net-next] liquidio: synchronize VF representor names with NIC firmware

2017-11-01 Thread Felix Manlunas
From: Vijaya Mohan Guvva 

LiquidIO firmware supports a vswitch that needs to know the names of the
VF representors in the host to maintain compatibility for direct
programming using external Openflow agents.  So, for each VF representor,
send its name to the firmware when it gets registered and when its name
changes.

Signed-off-by: Vijaya Mohan Guvva 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 15 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c  | 68 ++
 drivers/net/ethernet/cavium/liquidio/lio_vf_rep.h  |  2 +
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  8 ++-
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index f27f0af..f05045a 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1639,6 +1639,10 @@ static void liquidio_remove(struct pci_dev *pdev)
if (oct_dev->watchdog_task)
kthread_stop(oct_dev->watchdog_task);
 
+   if (!oct_dev->octeon_id &&
+   oct_dev->fw_info.app_cap_flags & LIQUIDIO_SWITCHDEV_CAP)
+   lio_vf_rep_modexit();
+
if (oct_dev->app_mode && (oct_dev->app_mode == CVM_DRV_NIC_APP))
liquidio_stop_nic_module(oct_dev);
 
@@ -4029,6 +4033,17 @@ static int liquidio_init_nic_module(struct octeon_device 
*oct)
goto octnet_init_failure;
}
 
+   /* Call vf_rep_modinit if the firmware is switchdev capable
+* and do it from the first liquidio function probed.
+*/
+   if (!oct->octeon_id &&
+   oct->fw_info.app_cap_flags & LIQUIDIO_SWITCHDEV_CAP) {
+   if (lio_vf_rep_modinit()) {
+   liquidio_stop_nic_module(oct);
+   goto octnet_init_failure;
+   }
+   }
+
liquidio_ptp_init(oct);
 
dev_dbg(>pci_dev->dev, "Network interfaces ready\n");
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
index de0c80d..2adafa3 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
@@ -625,3 +625,71 @@ static void lio_vf_rep_get_stats64(struct net_device *dev,
 
oct->vf_rep_list.num_vfs = 0;
 }
+
+static int
+lio_vf_rep_netdev_event(struct notifier_block *nb,
+   unsigned long event, void *ptr)
+{
+   struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+   struct lio_vf_rep_desc *vf_rep;
+   struct lio_vf_rep_req rep_cfg;
+   struct octeon_device *oct;
+   int ret;
+
+   switch (event) {
+   case NETDEV_REGISTER:
+   case NETDEV_CHANGENAME:
+   break;
+
+   default:
+   return NOTIFY_DONE;
+   }
+
+   if (ndev->netdev_ops != _vf_rep_ndev_ops)
+   return NOTIFY_DONE;
+
+   vf_rep = netdev_priv(ndev);
+   oct = vf_rep->oct;
+
+   if (strlen(ndev->name) > LIO_IF_NAME_SIZE) {
+   dev_err(&oct->pci_dev->dev,
+   "Device name change sync failed as the size is > %d\n",
+   LIO_IF_NAME_SIZE);
+   return NOTIFY_DONE;
+   }
+
+   memset(&rep_cfg, 0, sizeof(rep_cfg));
+   rep_cfg.req_type = LIO_VF_REP_REQ_DEVNAME;
+   rep_cfg.ifidx = vf_rep->ifidx;
+   strncpy(rep_cfg.rep_name.name, ndev->name, LIO_IF_NAME_SIZE);
+
+   ret = lio_vf_rep_send_soft_command(oct, &rep_cfg,
+  sizeof(rep_cfg), NULL, 0);
+   if (ret)
+   dev_err(&oct->pci_dev->dev,
+   "vf_rep netdev name change failed with err %d\n", ret);
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block lio_vf_rep_netdev_notifier = {
+   .notifier_call = lio_vf_rep_netdev_event,
+};
+
+int
+lio_vf_rep_modinit(void)
+{
+   if (register_netdevice_notifier(&lio_vf_rep_netdev_notifier)) {
+   pr_err("netdev notifier registration failed\n");
+   return -EFAULT;
+   }
+
+   return 0;
+}
+
+void
+lio_vf_rep_modexit(void)
+{
+   if (unregister_netdevice_notifier(&lio_vf_rep_netdev_notifier))
+   pr_err("netdev notifier unregister failed\n");
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.h 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.h
index 5a9ec98..bb3cedc 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.h
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.h
@@ -44,4 +44,6 @@ struct lio_vf_rep_sc_ctx {
 
 int lio_vf_rep_create(struct octeon_device *oct);
 void lio_vf_rep_destroy(struct octeon_device *oct);
+int lio_vf_rep_modinit(void);
+void lio_vf_rep_modexit(void);
 #endif
diff --git 

Re: TCP connection closed without FIN or RST

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 22:22 +, Vitaly Davidovich wrote:
> Eric,
> 

> Yes I agree.  However the thing I’m still puzzled about is the client
> application is not reading/draining the recvq - ok, the client tcp
> stack should start advertising a 0 window size.  Does a 0 window size
> count against the tcp_retries2? Is that what you were alluding to in
> your first reply?
> 

Every time we receive a (valid) ACK, with a win 0 or not, the counter
of attempts is cleared, giving the sender the opportunity to send 15
more probes.
> 
> If it *does* count towards the retries limit then a RST doesn’t seem
> like a bad idea.  The client is responding with segments but the user
> app there just isn’t draining the data.  Presumably that RST has a
> good chance of reaching the client and then unblocking the read()
> there with a peer reset error.  Or am I missing something?
> 
> 
> If it doesn’t count towards the limit then I need to figure out why
> the 0 window size segments weren’t being sent by the client.

Yes please :)
> 
> 
> I will try to double check that the client was indeed advertising 0
> window size.  There’s nothing special about that machine - it’s a
> 4.1.35 kernel as well.  I wouldn’t expect the tcp stack there to be
> unresponsive just because the user app is sleeping.
> 





Re: mlx5 broken affinity

2017-11-01 Thread Jes Sorensen
On 11/01/2017 06:41 PM, Saeed Mahameed wrote:
> On Wed, Nov 1, 2017 at 11:20 AM, Jes Sorensen  wrote:
>> On 11/01/2017 01:21 PM, Sagi Grimberg wrote:
>> I am all in favor of making the automatic setup better, but assuming an
>> automatic setup is always right seems problematic. There could be
>> workloads where you may want to assign affinity explicitly.
>>
>> Jes
> 
> I vaguely remember Nacking Sagi's patch as we knew it would break
> mlx5e netdev affinity assumptions.
> Anyway we already submitted the mlx5e patch that removed such assumption
> 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_809196_=DwIBaQ=5VD0RTtNlTh3ycd41b3MUw=zRrmoWoylV2tnG53v9ZA2w=Z6xtsiQVL8xhTauY_DrOWYDhci-D49TqNKLWV_HK5Ug=pkxagNCZzy5-ZzMRTxIQ5pDfFq8WOSRdSx5zeQpQdBI=
>  ("net/mlx5e: Distribute RSS
> table among all RX rings")
> Jes please confirm you have it.
> 
> And I agree here that user should be able to read
> /proc/irq/x/smp_affinity and even modify it if required.

Hi Saeed,

I can confirm I have that patch - the problem is also present in Linus'
current tree.

I can read smp_affinity, but I cannot write to it.

Cheers,
Jes


[PATCH net-next 3/3] bpf: add test cases to bpf selftests to cover all meta tests

2017-11-01 Thread Daniel Borkmann
Let's also add test cases to cover all possible data_meta access tests
for good/bad access cases so we keep tracking them.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
Acked-by: John Fastabend 
---
 tools/testing/selftests/bpf/test_verifier.c | 442 
 1 file changed, 442 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 3b38a3d..bb3c4ad 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -7399,6 +7399,448 @@ struct test_val {
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
{
+   "XDP pkt read, pkt_meta' > pkt_data, good access",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct xdp_md, data_meta)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct xdp_md, data)),
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+   BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .prog_type = BPF_PROG_TYPE_XDP,
+   },
+   {
+   "XDP pkt read, pkt_meta' > pkt_data, bad access 1",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct xdp_md, data_meta)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct xdp_md, data)),
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+   BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .errstr = "R1 offset is outside of the packet",
+   .result = REJECT,
+   .prog_type = BPF_PROG_TYPE_XDP,
+   .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+   },
+   {
+   "XDP pkt read, pkt_meta' > pkt_data, bad access 2",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct xdp_md, data_meta)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct xdp_md, data)),
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 0),
+   BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .errstr = "R1 offset is outside of the packet",
+   .result = REJECT,
+   .prog_type = BPF_PROG_TYPE_XDP,
+   },
+   {
+   "XDP pkt read, pkt_data > pkt_meta', good access",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct xdp_md, data_meta)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct xdp_md, data)),
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+   BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .prog_type = BPF_PROG_TYPE_XDP,
+   .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+   },
+   {
+   "XDP pkt read, pkt_data > pkt_meta', bad access 1",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct xdp_md, data_meta)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct xdp_md, data)),
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 

[PATCH net-next 2/3] bpf: also improve pattern matches for meta access

2017-11-01 Thread Daniel Borkmann
Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct
packet access") to cover also the remaining data_meta/data matches
in the verifier. The matches are also refactored a bit to simplify
handling of all the cases.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
Acked-by: John Fastabend 
---
 kernel/bpf/verifier.c | 165 +-
 1 file changed, 96 insertions(+), 69 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cc3e94..530b685 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2787,6 +2787,99 @@ static void mark_map_regs(struct bpf_verifier_state 
*state, u32 regno,
}
 }
 
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+  struct bpf_reg_state *dst_reg,
+  struct bpf_reg_state *src_reg,
+  struct bpf_verifier_state *this_branch,
+  struct bpf_verifier_state *other_branch)
+{
+   if (BPF_SRC(insn->code) != BPF_X)
+   return false;
+
+   switch (BPF_OP(insn->code)) {
+   case BPF_JGT:
+   if ((dst_reg->type == PTR_TO_PACKET &&
+src_reg->type == PTR_TO_PACKET_END) ||
+   (dst_reg->type == PTR_TO_PACKET_META &&
+reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+   /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+   find_good_pkt_pointers(this_branch, dst_reg,
+  dst_reg->type, false);
+   } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+   src_reg->type == PTR_TO_PACKET) ||
+  (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+   src_reg->type == PTR_TO_PACKET_META)) {
+   /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+   find_good_pkt_pointers(other_branch, src_reg,
+  src_reg->type, true);
+   } else {
+   return false;
+   }
+   break;
+   case BPF_JLT:
+   if ((dst_reg->type == PTR_TO_PACKET &&
+src_reg->type == PTR_TO_PACKET_END) ||
+   (dst_reg->type == PTR_TO_PACKET_META &&
+reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+   /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+   find_good_pkt_pointers(other_branch, dst_reg,
+  dst_reg->type, true);
+   } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+   src_reg->type == PTR_TO_PACKET) ||
+  (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+   src_reg->type == PTR_TO_PACKET_META)) {
+   /* pkt_end < pkt_data', pkt_data > pkt_meta' */
+   find_good_pkt_pointers(this_branch, src_reg,
+  src_reg->type, false);
+   } else {
+   return false;
+   }
+   break;
+   case BPF_JGE:
+   if ((dst_reg->type == PTR_TO_PACKET &&
+src_reg->type == PTR_TO_PACKET_END) ||
+   (dst_reg->type == PTR_TO_PACKET_META &&
+reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+   /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+   find_good_pkt_pointers(this_branch, dst_reg,
+  dst_reg->type, true);
+   } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+   src_reg->type == PTR_TO_PACKET) ||
+  (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+   src_reg->type == PTR_TO_PACKET_META)) {
+   /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+   find_good_pkt_pointers(other_branch, src_reg,
+  src_reg->type, false);
+   } else {
+   return false;
+   }
+   break;
+   case BPF_JLE:
+   if ((dst_reg->type == PTR_TO_PACKET &&
+src_reg->type == PTR_TO_PACKET_END) ||
+   (dst_reg->type == PTR_TO_PACKET_META &&
+reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+   /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+   find_good_pkt_pointers(other_branch, dst_reg,
+  dst_reg->type, false);
+   } else if ((dst_reg->type == 

[PATCH net-next 0/3] BPF range marking improvements for meta data

2017-11-01 Thread Daniel Borkmann
The set contains improvements for direct packet access range
markings related to data_meta pointer and test cases for all
such access patterns that the verifier matches on.

Thanks!

Daniel Borkmann (3):
  bpf: minor cleanups after merge
  bpf: also improve pattern matches for meta access
  bpf: add test cases to bpf selftests to cover all meta tests

 kernel/bpf/verifier.c   | 167 +
 tools/testing/selftests/bpf/test_verifier.c | 552 +---
 2 files changed, 594 insertions(+), 125 deletions(-)

-- 
1.9.3



[PATCH net-next 1/3] bpf: minor cleanups after merge

2017-11-01 Thread Daniel Borkmann
Two minor cleanups after Dave's recent merge in f8ddadc4db6c
("Merge git://git.kernel.org...") of net into net-next in
order to get the code in line with what was done originally
in the net tree: i) use max() instead of max_t() since both
ranges are u16, ii) don't split the direct access test cases
in the middle with bpf_exit test cases from 390ee7e29fc
("bpf: enforce return code for cgroup-bpf programs").

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
Acked-by: John Fastabend 
---
 kernel/bpf/verifier.c   |   2 +-
 tools/testing/selftests/bpf/test_verifier.c | 144 ++--
 2 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bb6d6a..2cc3e94 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2532,7 +2532,7 @@ static void find_good_pkt_pointers(struct 
bpf_verifier_state *state,
continue;
reg = >stack[i].spilled_ptr;
if (reg->type == type && reg->id == dst_reg->id)
-   reg->range = max_t(u16, reg->range, new_range);
+   reg->range = max(reg->range, new_range);
}
 }
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 1b93941..3b38a3d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -7250,78 +7250,6 @@ struct test_val {
.prog_type = BPF_PROG_TYPE_XDP,
},
{
-   "bpf_exit with invalid return code. test1",
-   .insns = {
-   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-   BPF_EXIT_INSN(),
-   },
-   .errstr = "R0 has value (0x0; 0x)",
-   .result = REJECT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test2",
-   .insns = {
-   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-   BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
-   BPF_EXIT_INSN(),
-   },
-   .result = ACCEPT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test3",
-   .insns = {
-   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-   BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
-   BPF_EXIT_INSN(),
-   },
-   .errstr = "R0 has value (0x0; 0x3)",
-   .result = REJECT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test4",
-   .insns = {
-   BPF_MOV64_IMM(BPF_REG_0, 1),
-   BPF_EXIT_INSN(),
-   },
-   .result = ACCEPT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test5",
-   .insns = {
-   BPF_MOV64_IMM(BPF_REG_0, 2),
-   BPF_EXIT_INSN(),
-   },
-   .errstr = "R0 has value (0x2; 0x0)",
-   .result = REJECT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test6",
-   .insns = {
-   BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
-   BPF_EXIT_INSN(),
-   },
-   .errstr = "R0 is not a known value (ctx)",
-   .result = REJECT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
-   "bpf_exit with invalid return code. test7",
-   .insns = {
-   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
-   BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
-   BPF_EXIT_INSN(),
-   },
-   .errstr = "R0 has unknown scalar value",
-   .result = REJECT,
-   .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-   },
-   {
"XDP pkt read, pkt_end >= pkt_data', bad access 1",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
@@ -7470,6 +7398,78 @@ struct test_val {
.prog_type = BPF_PROG_TYPE_XDP,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
+   {
+   "bpf_exit with invalid return code. test1",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+   BPF_EXIT_INSN(),
+   },
+   .errstr = 

Re: mlx5 broken affinity

2017-11-01 Thread Saeed Mahameed
On Wed, Nov 1, 2017 at 11:20 AM, Jes Sorensen  wrote:
> On 11/01/2017 01:21 PM, Sagi Grimberg wrote:
>>> Hi,
>>
>> Hi Jes,
>>
>>> The below patch seems to have broken PCI IRQ affinity assignments for
>>> mlx5.
>>
>> I wouldn't call it breaking IRQ affinity assignments. It just makes
>> them automatic.
>
> Well it explicitly breaks the option for an admin to assign them
> manually, which contradicts what is written in
> Documentation/IRQ-affinity.txt
>
>>> Prior to this patch I could echo a value to /proc/irq//smp_affinity
>>> and it would get assigned. With this patch applied I get -EIO
>>
>> Adding Christoph,
>>
>> Ideally the affinity assignments would be better left alone, but
>> I wasn't aware that they are now immutable. Christoph?
>>
>>> The actual affinity assignments seem to have changed too, but I assume
>>> this is a result of the generic allocation?
>>
>> The affinity assignments should be giving you perfect linear assignment
>> of the rx/tx rings completion vectors to CPU cores:
>> first  -> 0
>> second -> 1
>> third  -> 2
>> ...
>>
>> Before, mlx5 spread affinity starting from the local numa node as it
>> relied on that when constructing RSS indirection table only to the local
>> numa node (which as a side effect hurt applications running on the far
>> node as the traffic was guaranteed to arrive rx rings located in the
>> "wrong" node).
>>
>> Now the RSS indirection table is linear across both numa nodes just like
>> the irq affinity settings. Another thing that was improved was the
>> pre/post vectors which blacklisted any non rx/tx completion vectors from
>> the affinity assignments (like port events completion vectors from
>> example).
>
> I am all in favor of making the automatic setup better, but assuming an
> automatic setup is always right seems problematic. There could be
> workloads where you may want to assign affinity explicitly.
>
> Jes

I vaguely remember Nacking Sagi's patch as we knew it would break
mlx5e netdev affinity assumptions.
Anyway we already submitted the mlx5e patch that removed such assumption

https://patchwork.ozlabs.org/patch/809196/ ("net/mlx5e: Distribute RSS
table among all RX rings")
Jes please confirm you have it.

And I agree here that user should be able to read
/proc/irq/x/smp_affinity and even modify it if required.


RE: removing bridge in vlan_filtering mode requests delete of attached ports main MAC address

2017-11-01 Thread Keller, Jacob E
> -Original Message-
> From: Toshiaki Makita [mailto:makita.toshi...@lab.ntt.co.jp]
> Sent: Tuesday, October 31, 2017 5:58 PM
> To: Keller, Jacob E ; vyase...@redhat.com;
> netdev@vger.kernel.org
> Cc: Malek, Patryk 
> Subject: Re: removing bridge in vlan_filtering mode requests delete of 
> attached
> ports main MAC address
> 
> On 2017/11/01 9:10, Keller, Jacob E wrote:
> >> -Original Message-
> >> From: Keller, Jacob E
> >> Sent: Thursday, October 26, 2017 1:33 PM
> >> To: Keller, Jacob E ; vyase...@redhat.com;
> >> netdev@vger.kernel.org
> >> Cc: Malek, Patryk 
> >> Subject: RE: removing bridge in vlan_filtering mode requests delete of
> attached
> >> ports main MAC address
> >>
> >>> -Original Message-
> >>> From: netdev-ow...@vger.kernel.org [mailto:netdev-
> >> ow...@vger.kernel.org]
> >>> On Behalf Of Keller, Jacob E
> >>> Sent: Thursday, October 26, 2017 1:27 PM
> >>> To: vyase...@redhat.com; netdev@vger.kernel.org
> >>> Cc: Malek, Patryk 
> >>> Subject: RE: removing bridge in vlan_filtering mode requests delete of
> attached
> >>> ports main MAC address
> >>>
>  -Original Message-
>  From: Vlad Yasevich [mailto:vyase...@redhat.com]
>  Sent: Thursday, October 26, 2017 3:22 AM
>  To: Keller, Jacob E ; netdev@vger.kernel.org
>  Cc: Malek, Patryk 
>  Subject: Re: removing bridge in vlan_filtering mode requests delete of
> >> attached
>  ports main MAC address
> 
>  Hi Jake
> 
>  I think adding a !fdb->local should work.  local fdb contain the address 
>  of
> >>> assigned
>  to
>  the ports of the bridge and those shouldn't be directly removed.
> 
>  If that works,  that looks like the right solution.
> 
>  -vlad
> 
> >>>
> >>> So this does prevent us from removing the port's address. However, if I 
> >>> add
> >> two
> >>> devices to the bridge, then after removing the bridge, each device now
> keeps
> >>> both permanent addresses in their list, which isn't what we want is it?
> >>>
> >>> Do we even want to assign the local fdb addresses to every port?
> >>>
> >>> Obviously, I don't fully understand this code, so I think I'm missing 
> >>> something
> >>> here.
> >>>
> >>> Regards,
> >>> Jake
> >>>
> >>
> >> Ok, I tried this again, and it didn't end up crossing the local device 
> >> addresses to
> >> each port. I'm not sure how that happened the first time yet, so maybe it 
> >> is
> >> correct to skip removing local addresses... but if we skip removing them,
> wouldn't
> >> we want to skip adding them too?
> >>
> >> Thanks,
> >> Jake
> >
> > There's definitely some weirdness going on, because I've been able to get 
> > the
> local port addresses added to the wrong device under some circumstances. It
> seems to be some sort of race condition, since I can't reliably re-create the
> scenario.
> >
> > Either way, some more insight on what the correct fix here would be nice.
> >
> > I'm thinking we want to skip adding or removing local addresses when 
> > switching
> into the static mode configuration.
> 
> If we skip adding them, we cannot receive frames which should be
> received on the bridge device during non-promiscuous mode.
> 
> --
> Toshiaki Makita

This makes sense, but then what removes the addresses upon bridge deletion or 
exiting static mode?

We want to make sure we remove the correct addresses but don't request a delete 
of the permanent MAC address? Or, do we just completely assume that a device 
will never actually delete its own permanent address, and thus say this is a 
driver's fault for allowing a delete request of its permanent address to do 
anything..?

Thanks,
Jake
 


Re: KASAN: stack-out-of-bounds Read in xfrm_state_find (2)

2017-11-01 Thread Florian Westphal
syzbot  
wrote:

[ cc Thomas Egerer ]

> syzkaller hit the following crash on
> 36ef71cae353f88fd6e095e2aaa3e5953af1685d
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x303d/0x3170
> net/xfrm/xfrm_state.c:1051
> Read of size 4 at addr 88003adb7760 by task syzkaller429801/2969

Seems this was added in
commit 8444cf712c5f71845cba9dc30d8f530ff0d5ff83
("xfrm: Allow different selector family in temporary state").

No idea how to fix this:

struct xfrm_state *
xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
const struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
unsigned short family)  // AF_INET
{
[..]
unsigned short encap_family = tmpl->encap_family; // AF_INET6
[..]
h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);

saddr, daddr point to ipv4 addresses inside an on-stack flowi4 struct,
i.e. they get hashed as ipv6 addresses which then results in invalid stack 
access.

What is this supposed to do if family != encap_family?

I also don't understand how address comparison is supposed to work in this
case: it seems that if saddr/daddr are v4 and the template is v6 we compare
full ipv6 addresses (how would that succeed...?) and, if saddr/daddr is v6
and the template is v4 we just compare the first 32 bits of the ipv6
addresses...?

This fix silences the reproducer, but I am not sure about it, it looks like it
papers over the real problem...

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1359,16 +1359,19 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const 
struct flowi *fl,
  struct xfrm_state **xfrm, unsigned short family)
 {
struct net *net = xp_net(policy);
+   xfrm_address_t tmp, daddr, saddr;
int nx;
int i, error;
-   xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
-   xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
-   xfrm_address_t tmp;
+
+   memset(, 0, sizeof(saddr));
+   memset(, 0, sizeof(daddr));
+
+   xfrm_flowi_addr_get(fl, , , family);
 
for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
struct xfrm_state *x;
-   xfrm_address_t *remote = daddr;
-   xfrm_address_t *local  = saddr;
+   xfrm_address_t *remote = 
+   xfrm_address_t *local  = 
struct xfrm_tmpl *tmpl = >xfrm_vec[i];
 
if (tmpl->mode == XFRM_MODE_TUNNEL ||
@@ -1389,8 +1392,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const 
struct flowi *fl,
 
if (x && x->km.state == XFRM_STATE_VALID) {
xfrm[nx++] = x;
-   daddr = remote;
-   saddr = local;
+   daddr = *remote;
+   saddr = *local;
continue;
}
if (x) {


Re: TCP connection closed without FIN or RST

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 21:45 +, Vitaly Davidovich wrote:
> Hi Eric,
> 
> 
> First, thanks for replying.  A couple of comments inline.
> 
> On Wed, Nov 1, 2017 at 4:51 PM Eric Dumazet 
> wrote:
> 
> On Wed, 2017-11-01 at 13:34 -0700, Eric Dumazet wrote:
> > On Wed, 2017-11-01 at 16:25 -0400, Vitaly Davidovich wrote:
> > > Hi all,
> > >
> > > I'm seeing some puzzling TCP behavior that I'm hoping
> someone on this
> > > list can shed some light on.  Apologies if this isn't the
> right forum
> > > for this type of question.  But here goes anyway :)
> > >
> > > I have client and server x86-64 linux machines with the
> 4.1.35 kernel.
> > > I set up the following test/scenario:
> > >
> > > 1) Client connects to the server and requests a stream of
> data.  The
> > > server (written in Java) starts to send data.
> > > 2) Client then goes to sleep for 15 minutes (I'll explain
> why below).
> > > 3) Naturally, the server's sendq fills up and it blocks on
> a write() syscall.
> > > 4) Similarly, the client's recvq fills up.
> > > 5) After 15 minutes the client wakes up and reads the data
> off the
> > > socket fairly quickly - the recvq is fully drained.
> > > 6) At about the same time, the server's write() fails with
> ETIMEDOUT.
> > > The server then proceeds to close() the socket.
> > > 7) The client, however, remains forever stuck in its
> read() call.
> > >
> > > When the client is stuck in read(), netstat on the server
> does not
> > > show the tcp connection - it's gone.  On the client,
> netstat shows the
> > > connection with 0 recv (and send) queue size and in
> ESTABLISHED state.
> > >
> > > I have done a packet capture (using tcpdump) on the
> server, and
> > > expected to see either a FIN or RST packet to be sent to
> the client -
> > > neither of these are present.  What is present, however,
> is a bunch of
> > > retrans from the server to the client, with what appears
> to be
> > > exponential backoff.  However, the conversation just stops
> around the
> > > time when the ETIMEDOUT error occurred.  I do not see any
> attempt to
> > > abort or gracefully shut down the TCP stream.
> > >
> > > When I strace the server thread that was blocked on
> write(), I do see
> > > the ETIMEDOUT error from write(), followed by a close() on
> the socket
> > > fd.
> > >
> > > Would anyone possibly know what could cause this? Or
> suggestions on
> > > how to troubleshoot further? In particular, are there any
> known cases
> > > where a FIN or RST wouldn't be sent after a write() times
> out due to
> > > too many retrans? I believe this might be related to the
> tcp_retries2
> > > behavior (the system is configured with the default value
> of 15),
> > > where too many retrans attempts will cause write() to
> error with a
> > > timeout.  My understanding is that this shouldn't do
> anything to the
> > > state of the socket on its own - it should stay in the
> ESTABLISHED
> > > state.  But then presumably a close() should start the
> shutdown state
> > > machine by sending a FIN packet to the client and entering
> FIN WAIT1
> > > on the server.
> > >
> > > Ok, as to why I'm doing a test where the client sleeps for
> 15 minutes
> > > - this is an attempt at reproducing a problem that I saw
> with a client
> > > that wasn't sleeping intentionally, but otherwise the
> situation
> > > appeared to be the same - the server write() blocked,
> eventually timed
> > > out, server tcp session was gone, but client was stuck in
> a read()
> > > syscall with the tcp session still in ESTABLISHED state.
> > >
> > > Thanks a lot ahead of time for any insights/help!
> >
> > We might have an issue with win 0 probes (Probe0), hitting a
> max number
> > of retransmits/probes.
> >
> > I can check this
> 
> If the receiver does not reply to window probes, then sender
> consider
> the flow is dead after 10 attempts
> (/proc/sys/net/ipv4/tcp_retries2 )
> Right, except I have it at 15 (which is also the default).
> 
> 
> Not sure why sending a FIN or RST in this state would be okay,
> since
> there is obviously something wrong on the receiver TCP
> 

Re: [PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 14:32 -0700, Eric Dumazet wrote:
> On Wed, Nov 1, 2017 at 2:10 PM, Christoph Paasch  wrote:
> > Since commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache
> > line misses") IPCB/IP6CB is no more at the beginning of the tcp_skb_cb.
> > This means that on the output path, we need to make sure that it has
> > been correctly initialized to 0, as is done in tcp_transmit_skb.
> >
> > However, when going through the other code-path in TCP that can send an
> > skb (e.g., through tcp_v6_send_synack), we end up in a situation where
> > IP6CB has some of its fields set to unexpected values. Depending on the
> > layout of tcp_skb_cb across the different kernel-versions this can be
> > lastopt, flags,...
> 
> Or not use tcp_init_nondata_skb() on non fast clones, since it adds
> unnecessary writes and clears.
> 
> tcp_make_synack() really has no business using tcp_init_nondata_skb()
> and could simply set th->seq = htonl(tcp_rsk(req)->snt_isn);

Something like :

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 
69cfdead0cb49e4365158048a0d1a9bbdd55fa83..5502abc5307f0ce1de610d4b70f3a59c4d5383c5
 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3399,13 +3399,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, 
struct dst_entry *dst,
tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
-   /* Setting of flags are superfluous here for callers (and ECE is
-* not even correctly set)
-*/
-   tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
-TCPHDR_SYN | TCPHDR_ACK);
-
-   th->seq = htonl(TCP_SKB_CB(skb)->seq);
+   skb->ip_summed = CHECKSUM_PARTIAL;
+   th->seq = htonl(tcp_rsk(req)->snt_isn);
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
 




[PATCH net-next v5 3/3] act_vlan: VLAN action rewrite to use RCU lock/unlock and update

2017-11-01 Thread Manish Kurup
Using a spinlock in the VLAN action causes performance issues when the VLAN
action is used on multiple cores. Rewrote the VLAN action to use RCU read
locking for reads and updates instead.

Acked-by: Jamal Hadi Salim 
Acked-by: Jiri Pirko 
Signed-off-by: Manish Kurup 
---
 include/net/tc_act/tc_vlan.h | 46 +--
 net/sched/act_vlan.c | 65 
 2 files changed, 78 insertions(+), 33 deletions(-)

diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index c2090df..22ae260 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -13,12 +13,17 @@
 #include 
 #include 
 
+struct tcf_vlan_params {
+   int   tcfv_action;
+   u16   tcfv_push_vid;
+   __be16tcfv_push_proto;
+   u8tcfv_push_prio;
+   struct rcu_head   rcu;
+};
+
 struct tcf_vlan {
struct tc_actioncommon;
-   int tcfv_action;
-   u16 tcfv_push_vid;
-   __be16  tcfv_push_proto;
-   u8  tcfv_push_prio;
+   struct tcf_vlan_params __rcu *vlan_p;
 };
 #define to_vlan(a) ((struct tcf_vlan *)a)
 
@@ -33,22 +38,45 @@ static inline bool is_tcf_vlan(const struct tc_action *a)
 
 static inline u32 tcf_vlan_action(const struct tc_action *a)
 {
-   return to_vlan(a)->tcfv_action;
+   u32 tcfv_action;
+
+   rcu_read_lock();
+   tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action;
+   rcu_read_unlock();
+
+   return tcfv_action;
 }
 
 static inline u16 tcf_vlan_push_vid(const struct tc_action *a)
 {
-   return to_vlan(a)->tcfv_push_vid;
+   u16 tcfv_push_vid;
+
+   rcu_read_lock();
+   tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid;
+   rcu_read_unlock();
+
+   return tcfv_push_vid;
 }
 
 static inline __be16 tcf_vlan_push_proto(const struct tc_action *a)
 {
-   return to_vlan(a)->tcfv_push_proto;
+   __be16 tcfv_push_proto;
+
+   rcu_read_lock();
+   tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto;
+   rcu_read_unlock();
+
+   return tcfv_push_proto;
 }
 
 static inline u8 tcf_vlan_push_prio(const struct tc_action *a)
 {
-   return to_vlan(a)->tcfv_push_prio;
-}
+   u8 tcfv_push_prio;
 
+   rcu_read_lock();
+   tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio;
+   rcu_read_unlock();
+
+   return tcfv_push_prio;
+}
 #endif /* __NET_TC_VLAN_H */
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index b093bad..148efc6d 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -26,6 +26,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
struct tcf_result *res)
 {
struct tcf_vlan *v = to_vlan(a);
+   struct tcf_vlan_params *p;
int action;
int err;
u16 tci;
@@ -33,24 +34,27 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
	tcf_lastuse_update(&v->tcf_tm);
bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
 
-	spin_lock(&v->tcf_lock);
-   action = v->tcf_action;
-
/* Ensure 'data' points at mac_header prior calling vlan manipulating
 * functions.
 */
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
 
-   switch (v->tcfv_action) {
+   rcu_read_lock();
+
+   action = READ_ONCE(v->tcf_action);
+
+   p = rcu_dereference(v->vlan_p);
+
+   switch (p->tcfv_action) {
case TCA_VLAN_ACT_POP:
err = skb_vlan_pop(skb);
if (err)
goto drop;
break;
case TCA_VLAN_ACT_PUSH:
-   err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid |
-   (v->tcfv_push_prio << VLAN_PRIO_SHIFT));
+   err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid |
+   (p->tcfv_push_prio << VLAN_PRIO_SHIFT));
if (err)
goto drop;
break;
@@ -69,14 +73,14 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
goto drop;
}
/* replace the vid */
-   tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
+   tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid;
/* replace prio bits, if tcfv_push_prio specified */
-   if (v->tcfv_push_prio) {
+   if (p->tcfv_push_prio) {
tci &= ~VLAN_PRIO_MASK;
-   tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
+   tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT;
}
/* put updated tci 

[PATCH net-next v5 2/3] nfp flower action: Modified to use VLAN helper functions

2017-11-01 Thread Manish Kurup
Modified netronome nfp flower action to use VLAN helper functions instead
of accessing the structure directly.

Signed-off-by: Manish Kurup 
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c 
b/drivers/net/ethernet/netronome/nfp/flower/action.c
index de64ced..c1c595f 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -58,7 +58,6 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan,
 const struct tc_action *action)
 {
size_t act_size = sizeof(struct nfp_fl_push_vlan);
-   struct tcf_vlan *vlan = to_vlan(action);
u16 tmp_push_vlan_tci;
 
push_vlan->head.jump_id = NFP_FL_ACTION_OPCODE_PUSH_VLAN;
@@ -67,8 +66,8 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan,
push_vlan->vlan_tpid = tcf_vlan_push_proto(action);
 
tmp_push_vlan_tci =
-   FIELD_PREP(NFP_FL_PUSH_VLAN_PRIO, vlan->tcfv_push_prio) |
-   FIELD_PREP(NFP_FL_PUSH_VLAN_VID, vlan->tcfv_push_vid) |
+   FIELD_PREP(NFP_FL_PUSH_VLAN_PRIO, tcf_vlan_push_prio(action)) |
+   FIELD_PREP(NFP_FL_PUSH_VLAN_VID, tcf_vlan_push_vid(action)) |
NFP_FL_PUSH_VLAN_CFI;
push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci);
 }
-- 
2.7.4



[PATCH net-next v5 1/3] act_vlan: Change stats update to use per-core stats

2017-11-01 Thread Manish Kurup
The VLAN action maintains one set of stats across all cores, and uses a
spinlock to synchronize updates to it from the same. Changed this to use a
per-CPU stats context instead.
This change will result in better performance.

Acked-by: Jamal Hadi Salim 
Acked-by: Jiri Pirko 
Signed-off-by: Manish Kurup 
---
 net/sched/act_vlan.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 16eb067..b093bad 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,9 +30,10 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
int err;
u16 tci;
 
-	spin_lock(&v->tcf_lock);
	tcf_lastuse_update(&v->tcf_tm);
-	bstats_update(&v->tcf_bstats, skb);
+   bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+
+	spin_lock(&v->tcf_lock);
action = v->tcf_action;
 
/* Ensure 'data' points at mac_header prior calling vlan manipulating
@@ -85,7 +86,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
 
 drop:
action = TC_ACT_SHOT;
-   v->tcf_qstats.drops++;
+   qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+
 unlock:
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
@@ -172,7 +174,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr 
*nla,
 
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
-				     &act_vlan_ops, bind, false);
+				     &act_vlan_ops, bind, true);
if (ret)
return ret;
 
-- 
2.7.4



[PATCH net-next v4 1/3] net sched act_vlan: Change stats update to use per-core stats

2017-11-01 Thread Manish Kurup
The VLAN action maintains one set of stats across all cores, and uses a
spinlock to synchronize updates to it from the same. Changed this to use a
per-CPU stats context instead.
This change will result in better performance.

Acked-by: Jamal Hadi Salim 
Acked-by: Jiri Pirko 
Signed-off-by: Manish Kurup 
---
 net/sched/act_vlan.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 16eb067..b093bad 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,9 +30,10 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
int err;
u16 tci;
 
-	spin_lock(&v->tcf_lock);
	tcf_lastuse_update(&v->tcf_tm);
-	bstats_update(&v->tcf_bstats, skb);
+   bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+
+	spin_lock(&v->tcf_lock);
action = v->tcf_action;
 
/* Ensure 'data' points at mac_header prior calling vlan manipulating
@@ -85,7 +86,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct 
tc_action *a,
 
 drop:
action = TC_ACT_SHOT;
-   v->tcf_qstats.drops++;
+   qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+
 unlock:
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
@@ -172,7 +174,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr 
*nla,
 
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
-				     &act_vlan_ops, bind, false);
+				     &act_vlan_ops, bind, true);
if (ret)
return ret;
 
-- 
2.7.4



[PATCH net-next v5 0/3] act_vlan rewrite, review comments incorporated

2017-11-01 Thread Manish Kurup
Hi everyone,

Modified the netronome drivers (flower action) to use the VLAN helper functions
instead of dereferencing the structure directly. This is required for the VLAN
action patch.

Could you please review?

Here're the changes:
v2: Fixed all helper functions to use RCU (rtnl_dereference) - Eric, Jamal
v2: Fixed indentation, extra line nits - Jamal, Jiri
v2: Moved rcu_head to the end of the struct - Jiri
v2: Re-formatted locals to reverse-christmas-tree - Jiri
v2: Removed mismatched spin_lock() - Cong
v2: Removed spin_lock_bh() in tcf_vlan_init, rtnl_dereference() should
suffice - Cong, Jiri
v4: Modified the nfp flower action code to use the VLAN helper functions
instead of referencing the structure directly. Isolated this into a
separate patch - Pieter Jansen
v5: Got rid of the unlikely() for the allocation case - Simon Horman

Acked-by: Jamal Hadi Salim 
Acked-by: Jiri Pirko 
Signed-off-by: Manish Kurup 

Manish Kurup (3):
  act_vlan: Change stats update to use per-core stats
  nfp flower action: Modified to use VLAN helper functions
  act_vlan: VLAN action rewrite to use RCU lock/unlock and update

 drivers/net/ethernet/netronome/nfp/flower/action.c |  5 +-
 include/net/tc_act/tc_vlan.h   | 46 +++---
 net/sched/act_vlan.c   | 71 ++
 3 files changed, 84 insertions(+), 38 deletions(-)

-- 
2.7.4



Re: [RFC PATCH] xfrm: fix regression introduced by xdst pcpu cache

2017-11-01 Thread Paul Moore
On Tue, Oct 31, 2017 at 7:08 PM, Florian Westphal  wrote:
> Paul Moore  wrote:
>> On Mon, Oct 30, 2017 at 10:58 AM, Stephen Smalley  wrote:
>> > matching before (as in this patch) or after calling xfrm_bundle_ok()?
>>
>> I would probably make the LSM call the last check, as you've done; but
>> I have to say that is just so it is consistent with the "LSM last"
>> philosophy and not because of any performance related argument.
>>
>> > ... Also,
>> > do we need to test xfrm->sel.family before calling xfrm_selector_match
>> > (as in this patch) or not - xfrm_state_look_at() does so when the
>> > state is XFRM_STATE_VALID but not when it is _ERROR or _EXPIRED?
>>
>> Speaking purely from a SELinux perspective, I'm not sure it matters:
>> as long as the labels match we are happy.  However, from a general
>> IPsec perspective it does seem like a reasonable thing.
>>
>> Granted I'm probably missing something, but it seems a little odd that
>> the code isn't already checking that the selectors match (... what am
>> I missing?).  It does check the policies, maybe that is enough in the
>> normal IPsec case?
>
> The assumption was that identical policies would yield the same SAs,
> but thats not correct.

Well, to be fair, I think the assumption is valid for normal,
unlabeled IPsec.  The problem comes when SELinux starts labeling SAs
and now you have multiple SAs for a given policy, each differing only
in the SELinux/LSM label.

Considering that adding the SELinux/LSM label effectively adds an
additional selector, I'm wondering if we should simply add the
SELinux/LSM label matching to xfrm_selector_match()?  Looking quickly
at the code it seems as though we always follow xfrm_selector_match()
with a LSM check anyway, the one exception being in
__xfrm_policy_check() ... which *might* be a valid exception, as we
don't do our access checks for inbound traffic at that point in the
stack.

>> > diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
>> > index 2746b62..171818b 100644
>> > --- a/net/xfrm/xfrm_policy.c
>> > +++ b/net/xfrm/xfrm_policy.c
>> > @@ -1820,6 +1820,11 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy 
>> > **pols, int num_pols,
>> > !xfrm_pol_dead(xdst) &&
>> > memcmp(xdst->pols, pols,
>> >sizeof(struct xfrm_policy *) * num_pols) == 0 &&
>> > +   (!xdst->u.dst.xfrm->sel.family ||
>> > +xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl,
>> > +xdst->u.dst.xfrm->sel.family)) &&
>> > +   security_xfrm_state_pol_flow_match(xdst->u.dst.xfrm,
>> > +  xdst->pols[0], fl) &&
>
> ... so this needs to walk the bundle and validate each selector.
>
> Alternatively we could always do template resolution and then check
> that all states found match those of the old pcpu xdst:
>
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -1786,19 +1786,23 @@ void xfrm_policy_cache_flush(void)
> put_online_cpus();
>  }
>
> -static bool xfrm_pol_dead(struct xfrm_dst *xdst)
> +static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
> +   struct xfrm_state * const xfrm[],
> +   int num)
>  {
> -   unsigned int num_pols = xdst->num_pols;
> -   unsigned int pol_dead = 0, i;
> +   const struct dst_entry *dst = &xdst->u.dst;
> +   int i;
>
> -   for (i = 0; i < num_pols; i++)
> -   pol_dead |= xdst->pols[i]->walk.dead;
> +   if (xdst->num_xfrms != num)
> +   return false;
>
> -   /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
> -   if (pol_dead)
> -   xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
> +   for (i = 0; i < num; i++) {
> +   if (!dst || dst->xfrm != xfrm[i])
> +   return false;
> +   dst = dst->child;
> +   }
>
> -   return pol_dead;
> +   return xfrm_bundle_ok(xdst);
>  }
>
>  static struct xfrm_dst *
> @@ -1812,26 +1816,28 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy 
> **pols, int num_pols,
> struct dst_entry *dst;
> int err;
>
> +   /* Try to instantiate a bundle */
> +   err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
> +   if (err <= 0) {
> +   if (err != 0 && err != -EAGAIN)
> +   XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
> +   return ERR_PTR(err);
> +   }
> +
> xdst = this_cpu_read(xfrm_last_dst);
> if (xdst &&
> xdst->u.dst.dev == dst_orig->dev &&
> xdst->num_pols == num_pols &&
> -   !xfrm_pol_dead(xdst) &&
> memcmp(xdst->pols, pols,
>sizeof(struct xfrm_policy *) * num_pols) == 0 &&
> -   xfrm_bundle_ok(xdst)) {
> +   xfrm_xdst_can_reuse(xdst, 

Re: [RFC PATCH 1/5] security: Add support for SCTP security hooks

2017-11-01 Thread Richard Haines
On Tue, 2017-10-31 at 14:41 -0200, Marcelo Ricardo Leitner wrote:
> On Tue, Oct 17, 2017 at 03:02:47PM +0100, Richard Haines wrote:
> > The SCTP security hooks are explained in:
> > Documentation/security/LSM-sctp.txt
> > 
> > Signed-off-by: Richard Haines 
> > ---
> >  Documentation/security/LSM-sctp.txt | 212
> > 
> >  include/linux/lsm_hooks.h   |  37 +++
> >  include/linux/security.h|  27 +
> >  security/security.c |  23 
> >  4 files changed, 299 insertions(+)
> >  create mode 100644 Documentation/security/LSM-sctp.txt
> > 
> > diff --git a/Documentation/security/LSM-sctp.txt
> > b/Documentation/security/LSM-sctp.txt
> > new file mode 100644
> > index 000..30fe9b5
> > --- /dev/null
> > +++ b/Documentation/security/LSM-sctp.txt
> > @@ -0,0 +1,212 @@
> > +   SCTP LSM Support
> > +  ==
> > +
> > +For security module support, three sctp specific hooks have been
> > implemented:
> > +security_sctp_assoc_request()
> > +security_sctp_bind_connect()
> > +security_sctp_sk_clone()
> > +
> > +Also the following security hook has been utilised:
> > +security_inet_conn_established()
> > +
> > +The usage of these hooks are described below with the SELinux
> > implementation
> > +described in Documentation/security/SELinux-sctp.txt
> > +
> > +
> > +security_sctp_assoc_request()
> > +--
> > +This new hook has been added to net/sctp/sm_statefuns.c where it
> > passes the
> > +@ep and @chunk->skb (the association INIT or INIT ACK packet) to
> > the security
> > +module. Returns 0 on success, error on failure.
> > +
> > +@ep - pointer to sctp endpoint structure.
> > +@skb - pointer to skbuff of association packet.
> > +@sctp_cid - set to sctp packet type (SCTP_CID_INIT or
> > SCTP_CID_INIT_ACK).
> > +
> > +The security module performs the following operations:
> > +  1) If this is the first association on @ep->base.sk, then set
> > the peer sid
> > + to that in @skb. This will ensure there is only one peer sid
> > assigned
> > + to @ep->base.sk that may support multiple associations.
> > +
> > +  2) If not the first association, validate the @ep->base.sk
> > peer_sid against
> > + the @skb peer sid to determine whether the association should
> > be allowed
> > + or denied.
> > +
> > +  3) If @sctp_cid = SCTP_CID_INIT, then set the sctp @ep sid to
> > socket's sid
> > + (from ep->base.sk) with MLS portion taken from @skb peer sid.
> > This will
> > + only be used by SCTP TCP style sockets and peeled off
> > connections as they
> > + cause a new socket to be generated.
> > +
> > + If IP security options are configured (CIPSO/CALIPSO), then
> > the ip options
> > + are set on the socket.
> > +
> > + To support this hook include/net/sctp/structs.h "struct
> > sctp_endpoint"
> > + has been updated with the following:
> > +
> > +   /* Security identifiers from incoming (INIT). These are
> > set by
> > +* security_sctp_assoc_request(). These will only be used
> > by
> > +* SCTP TCP type sockets and peeled off connections as
> > they
> > +* cause a new socket to be generated.
> > security_sctp_sk_clone()
> > +* will then plug these into the new socket.
> > +*/
> > +   u32 secid;
> > +   u32 peer_secid;
> > +
> > +
> > +security_sctp_bind_connect()
> > +-
> > +This new hook has been added to net/sctp/socket.c and
> > net/sctp/sm_make_chunk.c.
> > +It passes one or more ipv4/ipv6 addresses to the security module
> > for
> > +validation based on the @optname that will result in either a bind
> > or connect
> > +service as shown in the permission check tables below.
> > +Returns 0 on success, error on failure.
> > +
> > +@sk  - Pointer to sock structure.
> > +@optname - Name of the option to validate.
> > +@address - One or more ipv4 / ipv6 addresses.
> > +@addrlen - The total length of address(s). This is calculated
> > on each
> > +   ipv4 or ipv6 address using sizeof(struct
> > sockaddr_in) or
> > +   sizeof(struct sockaddr_in6).
> > +
> > +  --
> > 
> > +  | BIND Type
> > Checks   |
> > +  |   @optname | @address
> > contains |
> > +  ||
> > ---|
> > +  | SCTP_SOCKOPT_BINDX_ADD | One or more ipv4 / ipv6 addresses
> > |
> > +  | SCTP_PRIMARY_ADDR  | Single ipv4 or ipv6
> > address   |
> > +  | SCTP_SET_PEER_PRIMARY_ADDR | Single ipv4 or ipv6
> > address   |
> > +  --
> > 
> > +
> > +  --
> > 
> > +  | 

Re: [RFC PATCH 2/5] sctp: Add ip option support

2017-11-01 Thread Richard Haines
On Tue, 2017-10-31 at 15:06 -0200, Marcelo Ricardo Leitner wrote:
> Hello,
> 
> On Tue, Oct 17, 2017 at 02:58:06PM +0100, Richard Haines wrote:
> > Add ip option support to allow LSM security modules to utilise
> > CIPSO/IPv4
> > and CALIPSO/IPv6 services.
> > 
> > Signed-off-by: Richard Haines 
> > ---
> >  include/net/sctp/structs.h |  2 ++
> >  net/sctp/chunk.c   |  7 ---
> >  net/sctp/ipv6.c| 37 ++--
> > -
> >  net/sctp/output.c  |  3 ++-
> >  net/sctp/protocol.c| 36
> > 
> >  net/sctp/socket.c  |  5 -
> >  6 files changed, 78 insertions(+), 12 deletions(-)
> > 
> > diff --git a/include/net/sctp/structs.h
> > b/include/net/sctp/structs.h
> > index 5ab29af..7767577 100644
> > --- a/include/net/sctp/structs.h
> > +++ b/include/net/sctp/structs.h
> > @@ -461,6 +461,7 @@ struct sctp_af {
> > void(*ecn_capable)(struct sock *sk);
> > __u16   net_header_len;
> > int sockaddr_len;
> > +   int (*ip_options_len)(struct sock *sk);
> > sa_family_t sa_family;
> > struct list_head list;
> >  };
> > @@ -485,6 +486,7 @@ struct sctp_pf {
> > int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr
> > *addr);
> > void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
> > void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
> > +   void (*copy_ip_options)(struct sock *sk, struct sock
> > *newsk);
> > struct sctp_af *af;
> >  };
> >  
> > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > index 1323d41..e49e240 100644
> > --- a/net/sctp/chunk.c
> > +++ b/net/sctp/chunk.c
> > @@ -153,7 +153,6 @@ static void sctp_datamsg_assign(struct
> > sctp_datamsg *msg, struct sctp_chunk *chu
> > chunk->msg = msg;
> >  }
> >  
> > -
> >  /* A data chunk can have a maximum payload of (2^16 - 20).  Break
> >   * down any such message into smaller chunks.  Opportunistically,
> > fragment
> >   * the chunks down to the current MTU constraints.  We may get
> > refragmented
> > @@ -190,7 +189,10 @@ struct sctp_datamsg
> > *sctp_datamsg_from_user(struct sctp_association *asoc,
> >  */
> > max_data = asoc->pathmtu -
> >sctp_sk(asoc->base.sk)->pf->af->net_header_len
> > -
> 
> 
> > -  sizeof(struct sctphdr) - sizeof(struct
> > sctp_data_chunk);
> > +  sizeof(struct sctphdr) - sizeof(struct
> > sctp_data_chunk) -
> > +  sctp_sk(asoc->base.sk)->pf->af->
> 
> 
> > +  ip_options_len(asoc->base.sk);
> 
> Please add a var for sctp_sk(asoc->base.sk)->pf->af. That should also
> help to not break the dereferencing into multiple lines.
DONE
> 
> > +
> > max_data = SCTP_TRUNC4(max_data);
> >  
> > /* If the the peer requested that we authenticate DATA
> > chunks
> > @@ -210,7 +212,6 @@ struct sctp_datamsg
> > *sctp_datamsg_from_user(struct sctp_association *asoc,
> >  
> > /* Set first_len and then account for possible bundles on
> > first frag */
> > first_len = max_data;
> > -
> > /* Check to see if we have a pending SACK and try to let
> > it be bundled
> >  * with this message.  Do this if we don't have any data
> > queued already.
> >  * To check that, look at out_qlen and retransmit list.
> > diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> > index a4b6ffb..49c9011 100644
> > --- a/net/sctp/ipv6.c
> > +++ b/net/sctp/ipv6.c
> > @@ -423,6 +423,33 @@ static void sctp_v6_copy_addrlist(struct
> > list_head *addrlist,
> > rcu_read_unlock();
> >  }
> >  
> > +/* Copy over any ip options */
> > +static void sctp_v6_copy_ip_options(struct sock *sk, struct sock
> > *newsk)
> > +{
> > +   struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
> > +   struct ipv6_txoptions *opt;
> > +
> > +   newnp = inet6_sk(newsk);
> > +
> > +   rcu_read_lock();
> > +   opt = rcu_dereference(np->opt);
> > +   if (opt)
> > +   opt = ipv6_dup_options(newsk, opt);
> > +   RCU_INIT_POINTER(newnp->opt, opt);
> > +   rcu_read_unlock();
> > +}
> > +
> > +/* Account for the IP options */
> > +static int sctp_v6_ip_options_len(struct sock *sk)
> > +{
> > +   struct ipv6_pinfo *inet6 = inet6_sk(sk);
> > +
> > +   if (inet6->opt)
> > +   return inet6->opt->opt_flen + inet6->opt->opt_nflen;
> 
> Seems we need RCU protection here.
> And please add a var for inet6->opt.
I've changed and tested the following:

/* Account for the IP options */
static int sctp_v6_ip_options_len(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6_txoptions *opt;
int len = 0;

rcu_read_lock();
opt = rcu_dereference(np->opt);
if (opt)
len = opt->opt_flen + opt->opt_nflen;

rcu_read_unlock();
return len;
}
> 
> > +   else
> > +   return 0;
> > +}
> > +
> > 

RE::

2017-11-01 Thread Mrs Hsu Wealther
Are you available at your desk? I need you to please check your email box for a 
business letter.

With Regards,

Ms. Hui Weather



Re: [RFC PATCH 5/5] selinux: Add SCTP support

2017-11-01 Thread Richard Haines
On Tue, 2017-10-31 at 15:16 -0200, Marcelo Ricardo Leitner wrote:
> On Tue, Oct 17, 2017 at 02:59:53PM +0100, Richard Haines wrote:
> > The SELinux SCTP implementation is explained in:
> > Documentation/security/SELinux-sctp.txt
> > 
> > Signed-off-by: Richard Haines 
> > ---
> 
> ...
> > diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> > index 33fd061..c3e9600 100644
> > --- a/security/selinux/hooks.c
> > +++ b/security/selinux/hooks.c
> 
> ...
> > @@ -4521,7 +4565,14 @@ static int selinux_socket_connect(struct
> > socket *sock, struct sockaddr *address,
> > unsigned short snum;
> > u32 sid, perm;
> >  
> > -   if (sk->sk_family == PF_INET) {
> > +   /* sctp_connectx(3) calls via
> > +*selinux_sctp_bind_connect() that validates
> > multiple
> > +* connect addresses. Because of this need to
> > check
> > +* address->sa_family as it is possible to have
> > +* sk->sk_family = PF_INET6 with addr->sa_family =
> > AF_INET.
> > +*/
> > +   if (sk->sk_family == PF_INET ||
> > +   address->sa_family ==
> > AF_INET) {
> 
> Not sure which code style applies on this file but the if () above
> looks odd. At least, checkpatch.pl complained about it.
Changed to read:
if (sk->sk_family == PF_INET ||
address->sa_family == AF_INET) {

> 
>   Marcelo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-
> security-module" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Bond recovery from BOND_LINK_FAIL state not working

2017-11-01 Thread Jay Vosburgh
Alex Sidorenko  wrote:

>The problem has been found while trying to deploy RHEL7 on HPE Synergy
>platform, it is seen both in customer's environment and in HPE test lab.
>
>There are several bonds configured in TLB mode and miimon=100, all other
>options are default. Slaves are connected to VirtualConnect
>modules. Rebooting a VC module should bring one bond slave (ens3f0) down
>temporarily, but not another one (ens3f1). But what we see is
>
>Oct 24 10:37:12 SYDC1LNX kernel: bond0: link status up again after 0 ms for 
>interface ens3f1
>
>and it never recovers. When VC reboot is complete, everything goes back to 
>normal again.
>
>Redhat has backported all recent upstream commits and instrumented the
>bonding driver. We have found the following (when VC goes down)
>
>In bond_miimon_inspect() the first slave goes to
>   bond_propose_link_state(slave, BOND_LINK_FAIL);
>   and
>   slave->new_link = BOND_LINK_DOWN;
>
>The second slave is still
>   slave->link = BOND_LINK_UP;
>   and
>slave->new_link = BOND_LINK_NOCHANGE;
>
>This is as expected. But in bond_miimon_commit() we see that _both_ slaves
>are in BOND_LINK_FAIL.  That is, something changes the state of the second
>slave from another thread. We suspect the NetworkManager, as the problem
>is there _only_ when bonds are controlled by it, if we set
>NM_CONTROLLED=no everything starts working normally.
>
>While we still do not understand how NM affects bond state, I think that
>bonding driver needs to be made reliable enough to recover even from this
>state.

You're suggesting that the bonding driver be able to distinguish
"false" down states set by network manager from a genuine link failure?
Am I misunderstanding?

>At this moment when we enter bond_miimon_inspect() with
>slave->link = BOND_LINK_FAIL and are in the following code
>
>/*FALLTHRU*/
>case BOND_LINK_BACK:
>if (!link_state) {
>bond_propose_link_state(slave, BOND_LINK_DOWN);
>netdev_info(bond->dev, "link status down
>again after %d ms for interface %s\n",
>(bond->params.updelay - 
> slave->delay) *
>bond->params.miimon,
>slave->dev->name);
>
>commit++;
>continue;
>}
>

Looking at bonding in the current net-next, if a slave enters
bond_miimon_inspect with slave->link == BOND_LINK_FAIL, it will not
proceed to the BOND_LINK_BACK block of the switch; BOND_LINK_FAIL does
not fall through that far.

Did you actually mean the BOND_LINK_FAIL block of the switch?
I'll assume so for the rest of my reply.

>we propose a new state and do 'commit++', but we do not change
>slave->new_link from BOND_LINK_NOCHANGE. As a result, bond_miimon_commit()
>will not process this slave.
>
>The following patch fixes the issue:
>
>
>If we enter bond_miimon_inspect() with slave_link=BOND_LINK_FAIL
>and recover, we do bond_propose_link_state(slave, BOND_LINK_UP);
>but do not change slave->new_link, so it is left in
>BOND_LINK_NOCHANGE. As a result, bond_miimon_commit() will not
>process that slave and it never recovers. We need to set
>slave->new_link = BOND_LINK_UP to make bond_miimon_commit() work

What is your downdelay set to?  In principle,
bond_miimon_inspect should not enter with a slave in BOND_LINK_FAIL
state if downdelay is 0.

> drivers/net/bonding/bond_main.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index c99dc59..07aa7ba 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2072,6 +2072,7 @@ static int bond_miimon_inspect(struct bonding *bond)
>(bond->params.downdelay - 
> slave->delay) *
>bond->params.miimon,
>slave->dev->name);
>+   slave->new_link = BOND_LINK_UP;
>commit++;
>continue;
>}
>-- 
>2.7.4

Your patch does not apply to net-next, so I'm not absolutely
sure where this is, but presuming that this is in the BOND_LINK_FAIL
case of the switch, it looks like both BOND_LINK_FAIL and BOND_LINK_BACK
will have the issue that if the link recovers or fails, respectively,
within the delay window (for down/updelay > 0) it won't set a
slave->new_link.

Looks like this got lost somewhere along the line, as originally
the transition back to UP (or DOWN) happened immediately, and that has
been lost somewhere.

I'll have to dig out when that broke, but I'll see about a test
patch this 

Re: [PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Eric Dumazet
On Wed, Nov 1, 2017 at 2:10 PM, Christoph Paasch  wrote:
> Since commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache
> line misses") IPCB/IP6CB is no more at the beginning of the tcp_skb_cb.
> This means that on the output path, we need to make sure that it has
> been correctly initialized to 0, as is done in tcp_transmit_skb.
>
> However, when going through the other code-path in TCP that can send an
> skb (e.g., through tcp_v6_send_synack), we end up in a situation where
> IP6CB has some of its fields set to unexpected values. Depending on the
> layout of tcp_skb_cb across the different kernel-versions this can be
> lastopt, flags,...

Or not use tcp_init_nondata_skb() on non fast clones, since it adds
unnecessary writes and clears.

tcp_make_synack() really has no business using tcp_init_nondata_skb()
and could simply set th->seq = htonl(tcp_rsk(req)->snt_isn);


Re: [PATCH iproute2] bridge: fdb: print NDA_SRC_VNI if available

2017-11-01 Thread Stephen Hemminger
On Thu, 26 Oct 2017 10:12:55 -0700
Roopa Prabhu  wrote:

> From: Roopa Prabhu 
> 
> Signed-off-by: Roopa Prabhu 

In general, this looks ok and I will apply it.

But why is there no ability to set source vni? The kernel accepts
it as a parameter to vxlan, but there is no option to set it with
ip command when creating vxlan.


Re: [iproute2 1/2] tc: Add support for the CBS qdisc

2017-11-01 Thread Stephen Hemminger
On Thu, 26 Oct 2017 10:17:48 -0700
Jeff Kirsher  wrote:

> From: Vinicius Costa Gomes 
> 
> The Credit Based Shaper (CBS) queueing discipline allows bandwidth
> reservation with sub-milisecond precision. It is defined by the
> 802.1Q-2014 specification (section 8.6.8.2 and Annex L).
> 
> The syntax is:
> 
> tc qdisc add dev DEV parent NODE cbs locredit 
>   hicredit  sendslope 
>   idleslope 
> 
> (The order is not important)
> 
> Signed-off-by: Vinicius Costa Gomes 
> Signed-off-by: Jeff Kirsher 

Applied to net-next branch (now that CBS is upstream in kernel).



Re: [PATCH] ip/ipvlan: enhance ability to add mode flags to existing modes

2017-11-01 Thread Stephen Hemminger
On Mon, 30 Oct 2017 13:57:51 -0700
Mahesh Bandewar  wrote:

> From: Mahesh Bandewar 
> 
> IPvlan supported bridge-only functionality prior to commits
> a190d04db937 ('ipvlan: introduce 'private' attribute for all
> existing modes.') and fe89aa6b250c ('ipvlan: implement VEPA mode').
> These two commits allow to configure the VEPA and private modes now.
> This patch adds those options in ip command.
> 
> e.g.
>   bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 private
>   -or-
>   bash:~# ip link add link eth0 type ipvl0 type ipvlan mode l2 vepa
> 
> Also the output will reflect the mode and the mode-flag accordingly.
> e.g.
>   bash:~# ip -details link show ipvl0
>   4: ipvl0@eth0:  mtu 1500 qdisc ...
>  link/ether 00:1a:11:44:a5:3e brd ff:ff:ff:ff:ff:ff promiscuity 0
>  ipvlan  mode l2 private addrgenmode eui64 numtxqueues 1 ...
> 
> Signed-off-by: Mahesh Bandewar 

Applied to net-next branch (now that this is upstream).
Thanks Mahesh


[PATCH net] tcp: Always cleanup skb before sending

2017-11-01 Thread Christoph Paasch
Since commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache
line misses") IPCB/IP6CB is no more at the beginning of the tcp_skb_cb.
This means that on the output path, we need to make sure that it has
been correctly initialized to 0, as is done in tcp_transmit_skb.

However, when going through the other code-path in TCP that can send an
skb (e.g., through tcp_v6_send_synack), we end up in a situation where
IP6CB has some of its fields set to unexpected values. Depending on the
layout of tcp_skb_cb across the different kernel-versions this can be
lastopt, flags,...

This patch makes sure that IPCB/IP6CB is always set to 0 before sending.

Cc: Eric Dumazet 
Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line 
misses")
Signed-off-by: Christoph Paasch 
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp_ipv4.c   |  6 ++
 net/ipv4/tcp_output.c | 20 
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e6d0002a1b0b..a375ee8fc534 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2032,6 +2032,8 @@ static inline void tcp_listendrop(const struct sock *sk)
 
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
+void tcp_skb_cleanup(struct sk_buff *skb);
+
 /*
  * Interface for adding Upper Level Protocols over TCP
  */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5b027c69cbc5..db7dd65b1f19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -709,6 +709,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
+
+   tcp_skb_cleanup(skb);
+
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -795,6 +798,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.bound_dev_if = oif;
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
+
+   tcp_skb_cleanup(skb);
+
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  skb, _SKB_CB(skb)->header.h4.opt,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 823003eef3a2..6935a68d449b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -973,6 +973,16 @@ static void tcp_internal_pacing(struct sock *sk, const 
struct sk_buff *skb)
  HRTIMER_MODE_ABS_PINNED);
 }
 
+void tcp_skb_cleanup(struct sk_buff *skb)
+{
+   /* Our usage of tstamp should remain private */
+   skb->tstamp = 0;
+
+   /* Cleanup our debris for IP stacks */
+   memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+  sizeof(struct inet6_skb_parm)));
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1115,12 +1125,7 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it,
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-   /* Our usage of tstamp should remain private */
-   skb->tstamp = 0;
-
-   /* Cleanup our debris for IP stacks */
-   memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
-  sizeof(struct inet6_skb_parm)));
+   tcp_skb_cleanup(skb);
 
err = icsk->icsk_af_ops->queue_xmit(sk, skb, >cork.fl);
 
@@ -3204,8 +3209,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, 
struct dst_entry *dst,
rcu_read_unlock();
 #endif
 
-   /* Do not fool tcpdump (if any), clean our debris */
-   skb->tstamp = 0;
+   tcp_skb_cleanup(skb);
return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
-- 
2.14.1



Re: [PATCH v2 iproute2] ip: add fastopen_no_cookie option to ip route

2017-11-01 Thread Stephen Hemminger
On Tue, 31 Oct 2017 14:54:52 -0700
Christoph Paasch  wrote:

> This patch adds fastopen_no_cookie option to enable/disable TCP fastopen
> without a cookie on a per-route basis.
> 
> Support in Linux was added with 71c02379c762 (tcp: Configure TFO without
> cookie per socket and/or per route).
> 
> Cc: Stephen Hemminger 
> Signed-off-by: Christoph Paasch 

Applied to net-next branch of iproute2.


[PATCH] [net-next,v2] ibmvnic: Feature implementation of Vital Product Data (VPD) for the ibmvnic driver

2017-11-01 Thread Desnes Augusto Nunes do Rosario
This patch implements and enables VPD support for the ibmvnic driver.
Moreover, it includes the implementation of suitable structs, signal
 transmission/handling and functions which allow the retrieval of firmware
 information from the ibmvnic card.

Signed-off-by: Desnes A. Nunes do Rosario 
Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 135 -
 drivers/net/ethernet/ibm/ibmvnic.h |  27 +++-
 2 files changed, 159 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index d0cff28..ee7a27b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -573,6 +573,15 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter)
return 0;
 }
 
+static void release_vpd_data(struct ibmvnic_adapter *adapter)
+{
+   if (!adapter->vpd)
+   return;
+
+   kfree(adapter->vpd->buff);
+   kfree(adapter->vpd);
+}
+
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
 {
struct ibmvnic_tx_pool *tx_pool;
@@ -753,6 +762,8 @@ static void release_resources(struct ibmvnic_adapter 
*adapter)
 {
int i;
 
+   release_vpd_data(adapter);
+
release_tx_pools(adapter);
release_rx_pools(adapter);
 
@@ -833,6 +844,53 @@ static int set_real_num_queues(struct net_device *netdev)
return rc;
 }
 
+static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
+{
+   struct device *dev = >vdev->dev;
+   union ibmvnic_crq crq;
+   dma_addr_t dma_addr;
+   int len;
+
+   if (adapter->vpd->buff)
+   len = adapter->vpd->len;
+
+   reinit_completion(>fw_done);
+   crq.get_vpd_size.first = IBMVNIC_CRQ_CMD;
+   crq.get_vpd_size.cmd = GET_VPD_SIZE;
+   ibmvnic_send_crq(adapter, );
+   wait_for_completion(>fw_done);
+
+   if (!adapter->vpd->buff)
+   adapter->vpd->buff = kzalloc(adapter->vpd->len, GFP_KERNEL);
+   else if (adapter->vpd->len != len)
+   adapter->vpd->buff =
+   krealloc(adapter->vpd->buff,
+adapter->vpd->len, GFP_KERNEL);
+
+   if (!adapter->vpd->buff) {
+   dev_err(dev, "Could allocate VPD buffer\n");
+   return -ENOMEM;
+   }
+
+   adapter->vpd->dma_addr =
+   dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
+  DMA_FROM_DEVICE);
+   if (dma_mapping_error(dev, dma_addr)) {
+   dev_err(dev, "Could not map VPD buffer\n");
+   return -ENOMEM;
+   }
+
+   reinit_completion(>fw_done);
+   crq.get_vpd.first = IBMVNIC_CRQ_CMD;
+   crq.get_vpd.cmd = GET_VPD;
+   crq.get_vpd.ioba = cpu_to_be32(adapter->vpd->dma_addr);
+   crq.get_vpd.len = cpu_to_be32((u32)adapter->vpd->len);
+   ibmvnic_send_crq(adapter, );
+   wait_for_completion(>fw_done);
+
+   return 0;
+}
+
 static int init_resources(struct ibmvnic_adapter *adapter)
 {
struct net_device *netdev = adapter->netdev;
@@ -850,6 +908,10 @@ static int init_resources(struct ibmvnic_adapter *adapter)
if (rc)
return rc;
 
+   adapter->vpd = kzalloc(sizeof(*adapter->vpd), GFP_KERNEL);
+   if (!adapter->vpd)
+   return -ENOMEM;
+
adapter->map_id = 1;
adapter->napi = kcalloc(adapter->req_rx_queues,
sizeof(struct napi_struct), GFP_KERNEL);
@@ -950,6 +1012,10 @@ static int ibmvnic_open(struct net_device *netdev)
 
rc = __ibmvnic_open(netdev);
netif_carrier_on(netdev);
+
+   /* Vital Product Data (VPD) */
+   ibmvnic_get_vpd(adapter);
+
mutex_unlock(>reset_lock);
 
return rc;
@@ -1878,11 +1944,15 @@ static int ibmvnic_get_link_ksettings(struct net_device 
*netdev,
return 0;
 }
 
-static void ibmvnic_get_drvinfo(struct net_device *dev,
+static void ibmvnic_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
 {
+   struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+
strlcpy(info->driver, ibmvnic_driver_name, sizeof(info->driver));
strlcpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version));
+   strlcpy(info->fw_version, adapter->fw_version,
+   sizeof(info->fw_version));
 }
 
 static u32 ibmvnic_get_msglevel(struct net_device *netdev)
@@ -3076,6 +3146,63 @@ static void send_cap_queries(struct ibmvnic_adapter 
*adapter)
ibmvnic_send_crq(adapter, );
 }
 
+static void handle_vpd_size_rsp(union ibmvnic_crq *crq,
+   struct ibmvnic_adapter *adapter)
+{
+   struct device *dev = >vdev->dev;
+
+   if (crq->get_vpd_size_rsp.rc.code) {
+   dev_err(dev, "Error retrieving VPD size, rc=%x\n",
+   

Re: [PATCH iproute2 v2 1/1] ip netns: use strtol() instead of atoi()

2017-11-01 Thread Stephen Hemminger
On Tue, 31 Oct 2017 14:24:19 -0400
Roman Mashak  wrote:

> Use strtol-based API to parse and validate integer input; atoi() does
> not detect errors and may yield undefined behaviour if result can't be
> represented.
> 
> v2: use get_unsigned() since network namespace is really an unsigned value.
> 
> Signed-off-by: Roman Mashak 

Applied. Thanks Roman



Re: [PATCH iproute2 3/3] xfrm_{state,policy}: Allow to deleteall polices/states with marks

2017-11-01 Thread Stephen Hemminger
On Wed, 1 Nov 2017 10:52:54 +0100
Thomas Egerer  wrote:

> Using 'ip deleteall' with policies that have marks, fails unless you
> explicitly specify the mark values. This is very uncomfortable when
> bulk-deleting policies and states. With this patch all relevant states
> and policies are wiped by 'ip deleteall' regardless of their mark
> values.
> 
> Signed-off-by: Thomas Egerer 

Like netdev to the kernel, you need to resend the whole patch series.


Re: [PATCH iproute2] Add "show" subcommand to "ip fou"

2017-11-01 Thread Stephen Hemminger
On Tue, 31 Oct 2017 13:00:47 -0700
Greg Greenway  wrote:

> + if (tb[FOU_ATTR_AF]) {
> + family = rta_getattr_u8(tb[FOU_ATTR_AF]);
> + if (family == AF_INET)
> + family_str = "AF_INET";
> + else if (family == AF_INET6)
> + family_str = "AF_INET6";
> + else
> + family_str = "unknown";
> + fprintf(fp, "af %s ", family_str);

The unwritten rule for ip commands is that the show function
must format the output with same command syntax as the other commands 
set/add/delete.
Since there is no "af AF_INET" option to ip fou, this breaks that convention.
Either ignore the address family, change the add command, or output with same
syntax (-6); preferably the latter.


Re: TCP connection closed without FIN or RST

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 13:34 -0700, Eric Dumazet wrote:
> On Wed, 2017-11-01 at 16:25 -0400, Vitaly Davidovich wrote:
> > Hi all,
> > 
> > I'm seeing some puzzling TCP behavior that I'm hoping someone on this
> > list can shed some light on.  Apologies if this isn't the right forum
> > for this type of question.  But here goes anyway :)
> > 
> > I have client and server x86-64 linux machines with the 4.1.35 kernel.
> > I set up the following test/scenario:
> > 
> > 1) Client connects to the server and requests a stream of data.  The
> > server (written in Java) starts to send data.
> > 2) Client then goes to sleep for 15 minutes (I'll explain why below).
> > 3) Naturally, the server's sendq fills up and it blocks on a write() 
> > syscall.
> > 4) Similarly, the client's recvq fills up.
> > 5) After 15 minutes the client wakes up and reads the data off the
> > socket fairly quickly - the recvq is fully drained.
> > 6) At about the same time, the server's write() fails with ETIMEDOUT.
> > The server then proceeds to close() the socket.
> > 7) The client, however, remains forever stuck in its read() call.
> > 
> > When the client is stuck in read(), netstat on the server does not
> > show the tcp connection - it's gone.  On the client, netstat shows the
> > connection with 0 recv (and send) queue size and in ESTABLISHED state.
> > 
> > I have done a packet capture (using tcpdump) on the server, and
> > expected to see either a FIN or RST packet to be sent to the client -
> > neither of these are present.  What is present, however, is a bunch of
> > retrans from the server to the client, with what appears to be
> > exponential backoff.  However, the conversation just stops around the
> > time when the ETIMEDOUT error occurred.  I do not see any attempt to
> > abort or gracefully shut down the TCP stream.
> > 
> > When I strace the server thread that was blocked on write(), I do see
> > the ETIMEDOUT error from write(), followed by a close() on the socket
> > fd.
> > 
> > Would anyone possibly know what could cause this? Or suggestions on
> > how to troubleshoot further? In particular, are there any known cases
> > where a FIN or RST wouldn't be sent after a write() times out due to
> > too many retrans? I believe this might be related to the tcp_retries2
> > behavior (the system is configured with the default value of 15),
> > where too many retrans attempts will cause write() to error with a
> > timeout.  My understanding is that this shouldn't do anything to the
> > state of the socket on its own - it should stay in the ESTABLISHED
> > state.  But then presumably a close() should start the shutdown state
> > machine by sending a FIN packet to the client and entering FIN WAIT1
> > on the server.
> > 
> > Ok, as to why I'm doing a test where the client sleeps for 15 minutes
> > - this is an attempt at reproducing a problem that I saw with a client
> > that wasn't sleeping intentionally, but otherwise the situation
> > appeared to be the same - the server write() blocked, eventually timed
> > out, server tcp session was gone, but client was stuck in a read()
> > syscall with the tcp session still in ESTABLISHED state.
> > 
> > Thanks a lot ahead of time for any insights/help!
> 
> We might have an issue with win 0 probes (Probe0), hitting a max number
> of retransmits/probes.
> 
> I can check this

If the receiver does not reply to window probes, then sender consider
the flow is dead after 10 attempts (/proc/sys/net/ipv4/tcp_retries2 )

Not sure why sending a FIN or RST in this state would be okay, since
there is obviously something wrong on the receiver TCP implementation.

If after sending 10 probes, we need to add 10 more FIN packets just in
case there is still something at the other end, it adds a lot of
overhead on the network.





Re: Any hardware limitation for bpf testing?

2017-11-01 Thread Daniel Borkmann

On 11/01/2017 12:16 PM, Orson Zhai wrote:
[...]

We have ran some bpf test within kselftest for 4.14.0-rc5-next-20171018 at
some hardware resource limited devices. Say Hikey Board (arm64 core) with
2GB memory.

The test processes was killed by OOM which made the test failed.

We investigate the reason and find bpf test will require a large mount of
memory to run.

Like some code in selftests/bpf/test_maps.c


static void test_hashmap_walk(int task, void *data)
{
  int fd, i, max_entries = 10;

  

   ..
  }


The test will pass smoothly when we set max_entries to a smaller number.


Feel free to send a patch to set this to something lower, maybe 10k
or such, if that will work for your setup.

Thanks,
Daniel


Re: TCP connection closed without FIN or RST

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 16:25 -0400, Vitaly Davidovich wrote:
> Hi all,
> 
> I'm seeing some puzzling TCP behavior that I'm hoping someone on this
> list can shed some light on.  Apologies if this isn't the right forum
> for this type of question.  But here goes anyway :)
> 
> I have client and server x86-64 linux machines with the 4.1.35 kernel.
> I set up the following test/scenario:
> 
> 1) Client connects to the server and requests a stream of data.  The
> server (written in Java) starts to send data.
> 2) Client then goes to sleep for 15 minutes (I'll explain why below).
> 3) Naturally, the server's sendq fills up and it blocks on a write() syscall.
> 4) Similarly, the client's recvq fills up.
> 5) After 15 minutes the client wakes up and reads the data off the
> socket fairly quickly - the recvq is fully drained.
> 6) At about the same time, the server's write() fails with ETIMEDOUT.
> The server then proceeds to close() the socket.
> 7) The client, however, remains forever stuck in its read() call.
> 
> When the client is stuck in read(), netstat on the server does not
> show the tcp connection - it's gone.  On the client, netstat shows the
> connection with 0 recv (and send) queue size and in ESTABLISHED state.
> 
> I have done a packet capture (using tcpdump) on the server, and
> expected to see either a FIN or RST packet to be sent to the client -
> neither of these are present.  What is present, however, is a bunch of
> retrans from the server to the client, with what appears to be
> exponential backoff.  However, the conversation just stops around the
> time when the ETIMEDOUT error occurred.  I do not see any attempt to
> abort or gracefully shut down the TCP stream.
> 
> When I strace the server thread that was blocked on write(), I do see
> the ETIMEDOUT error from write(), followed by a close() on the socket
> fd.
> 
> Would anyone possibly know what could cause this? Or suggestions on
> how to troubleshoot further? In particular, are there any known cases
> where a FIN or RST wouldn't be sent after a write() times out due to
> too many retrans? I believe this might be related to the tcp_retries2
> behavior (the system is configured with the default value of 15),
> where too many retrans attempts will cause write() to error with a
> timeout.  My understanding is that this shouldn't do anything to the
> state of the socket on its own - it should stay in the ESTABLISHED
> state.  But then presumably a close() should start the shutdown state
> machine by sending a FIN packet to the client and entering FIN WAIT1
> on the server.
> 
> Ok, as to why I'm doing a test where the client sleeps for 15 minutes
> - this is an attempt at reproducing a problem that I saw with a client
> that wasn't sleeping intentionally, but otherwise the situation
> appeared to be the same - the server write() blocked, eventually timed
> out, server tcp session was gone, but client was stuck in a read()
> syscall with the tcp session still in ESTABLISHED state.
> 
> Thanks a lot ahead of time for any insights/help!

We might have an issue with win 0 probes (Probe0), hitting a max number
of retransmits/probes.

I can check this.






Re: [PATCH net-next] security: bpf: replace include of linux/bpf.h with forward declarations

2017-11-01 Thread Daniel Borkmann

On 11/01/2017 07:48 PM, Jakub Kicinski wrote:

Touching linux/bpf.h makes us rebuild a surprisingly large
portion of the kernel.  Remove the unnecessary dependency
from security.h, it only needs forward declarations.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 


Acked-by: Daniel Borkmann 


TCP connection closed without FIN or RST

2017-11-01 Thread Vitaly Davidovich
Hi all,

I'm seeing some puzzling TCP behavior that I'm hoping someone on this
list can shed some light on.  Apologies if this isn't the right forum
for this type of question.  But here goes anyway :)

I have client and server x86-64 linux machines with the 4.1.35 kernel.
I set up the following test/scenario:

1) Client connects to the server and requests a stream of data.  The
server (written in Java) starts to send data.
2) Client then goes to sleep for 15 minutes (I'll explain why below).
3) Naturally, the server's sendq fills up and it blocks on a write() syscall.
4) Similarly, the client's recvq fills up.
5) After 15 minutes the client wakes up and reads the data off the
socket fairly quickly - the recvq is fully drained.
6) At about the same time, the server's write() fails with ETIMEDOUT.
The server then proceeds to close() the socket.
7) The client, however, remains forever stuck in its read() call.

When the client is stuck in read(), netstat on the server does not
show the tcp connection - it's gone.  On the client, netstat shows the
connection with 0 recv (and send) queue size and in ESTABLISHED state.

I have done a packet capture (using tcpdump) on the server, and
expected to see either a FIN or RST packet to be sent to the client -
neither of these are present.  What is present, however, is a bunch of
retrans from the server to the client, with what appears to be
exponential backoff.  However, the conversation just stops around the
time when the ETIMEDOUT error occurred.  I do not see any attempt to
abort or gracefully shut down the TCP stream.

When I strace the server thread that was blocked on write(), I do see
the ETIMEDOUT error from write(), followed by a close() on the socket
fd.

Would anyone possibly know what could cause this? Or suggestions on
how to troubleshoot further? In particular, are there any known cases
where a FIN or RST wouldn't be sent after a write() times out due to
too many retrans? I believe this might be related to the tcp_retries2
behavior (the system is configured with the default value of 15),
where too many retrans attempts will cause write() to error with a
timeout.  My understanding is that this shouldn't do anything to the
state of the socket on its own - it should stay in the ESTABLISHED
state.  But then presumably a close() should start the shutdown state
machine by sending a FIN packet to the client and entering FIN WAIT1
on the server.

Ok, as to why I'm doing a test where the client sleeps for 15 minutes
- this is an attempt at reproducing a problem that I saw with a client
that wasn't sleeping intentionally, but otherwise the situation
appeared to be the same - the server write() blocked, eventually timed
out, server tcp session was gone, but client was stuck in a read()
syscall with the tcp session still in ESTABLISHED state.

Thanks a lot ahead of time for any insights/help!


Re: Oops with HTB on net-next

2017-11-01 Thread Dave Taht
On Wed, Nov 1, 2017 at 9:04 AM, Cong Wang  wrote:
> On Tue, Oct 31, 2017 at 11:42 PM, Dave Taht  wrote:
>> I am using a fairly complex htb + three tiers of fq_codel and a couple
>> tc filters (it's the sqm-scripts "simple.qos" model). I rebased on
>> net-next head an hour ago, and
>>
>> [8.357963] IPv6: ADDRCONF(NETDEV_CHANGE): enp2s0: link becomes ready
>> [9.759942] u32 classifier
>> [9.759944] Actions configured
>> [9.762152] Mirror/redirect action on
>> [9.859138] BUG: unable to handle kernel NULL pointer dereference
>> at 0018
>> [9.859149] IP: tcf_block_put+0xf/0x50
>
> Do you have this fix?
>
> commit 822e86d997e4d8f942818ea6ac1711f59a66ebef
> Author: Cong Wang 
> Date:   Mon Oct 30 11:10:09 2017 -0700
>
> net_sched: remove tcf_block_put_deferred()

No.

That is not in net-next, and the "net" version of that one patch does
not apply to net-next. The relevant thread says "... another fun merge
into net-next".

Please let me know when the fun is done, and I'll retest.

-- 

Dave Täht
CEO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-669-226-2619


Re: [PATCH V3 net-next 1/2] liquidio: switchdev support for LiquidIO NIC

2017-11-01 Thread Jakub Kicinski
On Wed, 1 Nov 2017 12:29:48 -0700, Felix Manlunas wrote:
> > > +lio_vf_rep_phys_port_name(struct net_device *dev,
> > > +   char *buf, size_t len)
> > > +{
> > > + struct lio_vf_rep_desc *vf_rep = netdev_priv(dev);
> > > + struct octeon_device *oct = vf_rep->oct;
> > > + int ret;
> > > +
> > > + ret = snprintf(buf, len, "pf%dvf%d", oct->pf_num,
> > > +vf_rep->ifidx - oct->pf_num * 64 - 1);
> > > + if (ret >= len)
> > > + return -EOPNOTSUPP;  
> > 
> > EOPNOTSUPP seems an odd return code for too short a buffer?  
> 
> We will replace that with ENOBUFS in a future patch.

Strangely enough mlx5 and bnxt also use EOPNOTSUPP if string doesn't
fit.  Others use EINVAL, IMHO that's more appropriate.


Re: [PATCH v10 2/8] dt-bindings: net: dwmac-sun8i: update documentation about integrated PHY

2017-11-01 Thread Rob Herring
On Tue, Oct 31, 2017 at 09:19:09AM +0100, Corentin Labbe wrote:
> This patch add documentation about the MDIO switch used on sun8i-h3-emac
> for integrated PHY.
> 
> Signed-off-by: Corentin Labbe 
> Acked-by: Florian Fainelli 
> Reviewed-by: Andrew Lunn 
> ---
>  .../devicetree/bindings/net/dwmac-sun8i.txt| 147 
> +++--
>  1 file changed, 135 insertions(+), 12 deletions(-)

Acked-by: Rob Herring 


Re: SKB Reference Question

2017-11-01 Thread Eric Dumazet
On Wed, 2017-11-01 at 12:22 -0700, Joe Smith wrote:
> On Wed, Nov 1, 2017 at 10:33 AM, Eric Dumazet  wrote:
> > On Wed, 2017-11-01 at 10:27 -0700, Joe Smith wrote:
> >> How strictly are references on the SKB enforced. For example,
> >> tcp_transmit_skb() clones the SKB and adds a TCP header. Can I assume
> >> that in case of re-transmission the header added will be there and can
> >> be reused instead of creating a new one from scratch. Some fields like
> >> time stamp would need to be updated but they should be unmodified.
> >
> >
> > Not sure what you are trying to do, but this seems messy ;)
> 
> As an example, consider reusing options that might be expensive to
> calculate. Assuming no coalescing.
> 
> >
> > At rtx time, there is no guarantee that the master skb has not been
> > changed, so the content of TCP header might be wrong anyway.
> 
> I would have thought that each layer below TCP would only add its own
> header and would not touch anything else. Is that not the guarantee
> that SKB references provide, or else the data could be changed.
> 
> Can you give an example why the TCP header could change.

Before rtx, skbs can be split or coalesced.

Depending on ACK received from opposite peer, available cwnd budget,
pacing rate...

Also ECN can play a role.







Re: [v2] can: Use common error handling code in vxcan_newlink()

2017-11-01 Thread SF Markus Elfring
> Acked-by: Oliver Hartkopp 

Thanks for another positive feedback.


> Again: Posting such a patch on linux-...@vger.kernel.org is ENOUGH!

I was informed in an other way for Linux software patches.


> No need to cross post such a patch additionally on
> 
> netdev@vger.kernel.org
> linux-ker...@vger.kernel.org

These addresses were suggested (or recommended?) by the script 
“get_maintainer.pl”.


> kernel-janit...@vger.kernel.org

I tend to add this address also for my update suggestions.


> and to each of the maintainers
…
> We all subscribed the mailing list and listen to it.

This is generally fine. - But I do not know how long such subscriptions
will be active.


> That's the intention of a mailing list ...

I know …


> Cross posting is not appreciated in the community.

How does this view fit to the information in the section “5) Select
the recipients for your patch” from the document “submitting-patches.rst”?

Regards,
Markus


Re: [PATCH] net: vrf: correct FRA_L3MDEV encode type

2017-11-01 Thread David Ahern
On 11/1/17 8:58 AM, Jeff Barnhill wrote:
> FRA_L3MDEV is defined as U8, but is being added as a U32 attribute. On
> big endian architecture, this results in the l3mdev entry not being
> added to the FIB rules.
> 
> Fixes: 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create")
> Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
> ---
>  drivers/net/vrf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
> index 9b243e6f3008..7dc3bcac3506 100644
> --- a/drivers/net/vrf.c
> +++ b/drivers/net/vrf.c
> @@ -1165,7 +1165,7 @@ static int vrf_fib_rule(const struct net_device *dev, 
> __u8 family, bool add_it)
>   frh->family = family;
>   frh->action = FR_ACT_TO_TBL;
>  
> - if (nla_put_u32(skb, FRA_L3MDEV, 1))
> + if (nla_put_u8(skb, FRA_L3MDEV, 1))
>   goto nla_put_failure;
>  
>   if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
> 

Acked-by: David Ahern 


[PATCH net] xfrm: defer daddr pointer assignment after spi parsing

2017-11-01 Thread Florian Westphal
syzbot reports:
BUG: KASAN: use-after-free in __xfrm_state_lookup+0x695/0x6b0
Read of size 4 at addr 8801d434e538 by task syzkaller647520/2991
[..]
__xfrm_state_lookup+0x695/0x6b0 net/xfrm/xfrm_state.c:833
xfrm_state_lookup+0x8a/0x160 net/xfrm/xfrm_state.c:1592
xfrm_input+0x8e5/0x22f0 net/xfrm/xfrm_input.c:302

The use-after-free is the ipv4 destination address, which points
to an skb head area that has been reallocated:
  pskb_expand_head+0x36b/0x1210 net/core/skbuff.c:1494
  __pskb_pull_tail+0x14a/0x17c0 net/core/skbuff.c:1877
  pskb_may_pull include/linux/skbuff.h:2102 [inline]
  xfrm_parse_spi+0x3d3/0x4d0 net/xfrm/xfrm_input.c:170
  xfrm_input+0xce2/0x22f0 net/xfrm/xfrm_input.c:291

so the real bug is that xfrm_parse_spi() uses pskb_may_pull, but
for now do smaller workaround that makes xfrm_input fetch daddr
after spi parsing.

Reported-by: syzbot 
Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 8ac9d32fb79d..1c6051cb7733 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -265,8 +265,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
goto lock;
}
 
-   daddr = (xfrm_address_t *)(skb_network_header(skb) +
-  XFRM_SPI_SKB_CB(skb)->daddroff);
family = XFRM_SPI_SKB_CB(skb)->family;
 
/* if tunnel is present override skb->mark value with tunnel i_key */
@@ -293,6 +291,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
goto drop;
}
 
+   daddr = (xfrm_address_t *)(skb_network_header(skb) +
+  XFRM_SPI_SKB_CB(skb)->daddroff);
do {
if (skb->sp->len == XFRM_MAX_DEPTH) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
-- 
2.13.6



Re: [PATCH V3 net-next 1/2] liquidio: switchdev support for LiquidIO NIC

2017-11-01 Thread Felix Manlunas
On Wed, Nov 01, 2017 at 02:02:07PM +0100, Andrew Lunn wrote:
> > +static int
> > +lio_pf_switchdev_attr_get(struct net_device *dev, struct switchdev_attr 
> > *attr)
> > +{
> > +   struct lio *lio = GET_LIO(dev);
> > +
> > +   switch (attr->id) {
> > +   case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
> > +   attr->u.ppid.id_len = ETH_ALEN;
> > +   ether_addr_copy(attr->u.ppid.id,
> > +   (void *)>linfo.hw_addr + 2);
> 
> The + 2 seems odd. Please could you explain why it is there?

The "+ 2" is the byte offset of the start of the mac address.  In a
future patch, we will replace "(void *)>linfo.hw_addr + 2" with
"dev->dev_addr" which is an elegant equivalent.


> > +static int lio_vf_rep_open(struct net_device *ndev);
> > +static int lio_vf_rep_stop(struct net_device *ndev);
> > +static int lio_vf_rep_pkt_xmit(struct sk_buff *skb, struct net_device 
> > *ndev);
> > +static void lio_vf_rep_tx_timeout(struct net_device *netdev);
> > +static int lio_vf_rep_phys_port_name(struct net_device *dev,
> > +char *buf, size_t len);
> > +static void lio_vf_rep_get_stats64(struct net_device *dev,
> > +  struct rtnl_link_stats64 *stats64);
> > +static int lio_vf_rep_change_mtu(struct net_device *ndev, int new_mtu);
> > +
> > +static const struct net_device_ops lio_vf_rep_ndev_ops = {
> > +   .ndo_open = lio_vf_rep_open,
> > +   .ndo_stop = lio_vf_rep_stop,
> > +   .ndo_start_xmit = lio_vf_rep_pkt_xmit,
> > +   .ndo_tx_timeout = lio_vf_rep_tx_timeout,
> > +   .ndo_get_phys_port_name = lio_vf_rep_phys_port_name,
> > +   .ndo_get_stats64 = lio_vf_rep_get_stats64,
> > +   .ndo_change_mtu = lio_vf_rep_change_mtu,
> > +};
> 
> Please don't use forward references. Change the order of the code and
> put this structure towards the end of the file.

We will fix this in a future patch.


> > +lio_vf_rep_phys_port_name(struct net_device *dev,
> > + char *buf, size_t len)
> > +{
> > +   struct lio_vf_rep_desc *vf_rep = netdev_priv(dev);
> > +   struct octeon_device *oct = vf_rep->oct;
> > +   int ret;
> > +
> > +   ret = snprintf(buf, len, "pf%dvf%d", oct->pf_num,
> > +  vf_rep->ifidx - oct->pf_num * 64 - 1);
> > +   if (ret >= len)
> > +   return -EOPNOTSUPP;
> 
> EOPNOTSUPP seems an odd return code for too short a buffer?

We will replace that with ENOBUFS in a future patch.


Re: SKB Reference Question

2017-11-01 Thread Joe Smith
On Wed, Nov 1, 2017 at 10:33 AM, Eric Dumazet  wrote:
> On Wed, 2017-11-01 at 10:27 -0700, Joe Smith wrote:
>> How strictly are references on the SKB enforced. For example,
>> tcp_transmit_skb() clones the SKB and adds a TCP header. Can I assume
>> that in case of re-transmission the header added will be there and can
>> be reused instead of creating a new one from scratch. Some fields like
>> time stamp would need to be updated but they should be unmodified.
>
>
> Not sure what you are trying to do, but this seems messy ;)

As an example, consider reusing options that might be expensive to
calculate. Assuming no coalescing.

>
> At rtx time, there is no guarantee that the master skb has not been
> changed, so the content of TCP header might be wrong anyway.

I would have thought that each layer below TCP would only add its own
header and would not touch anything else. Is that not the guarantee
that SKB references provide, or else the data could be changed.

Can you give an example why the TCP header could change.

Thanks a lot for your time.




>
>
>



-- 
JS


Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method

2017-11-01 Thread Alexei Starovoitov
On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
> > 
> > 
> > On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
> > > > +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> > > > +   struct bpf_prog *new)
> > > > +{
> > > > +   struct bpf_prog *old;
> > > > +
> > > > +   old = rtnl_dereference(tun->steering_prog);
> > > > +   rcu_assign_pointer(tun->steering_prog, new);
> > > > +
> > > > +   if (old) {
> > > > +   synchronize_net();
> > > > +   bpf_prog_destroy(old);
> > > > +   }
> > > > +}
> > > > +
> > > Is this really called under rtnl?
> > 
> > Yes it is __tun_chr_ioctl() will call rtnl_lock().
> 
> Is the call from tun_free_netdev under rtnl too?
> 
> > > If no then rtnl_dereference
> > > is wrong. If yes I'm not sure you can call synchronize_net
> > > under rtnl.
> > > 
> > 
> > Are you worrying about the long wait? Looking at synchronize_net(), it does:
> > 
> > void synchronize_net(void)
> > {
> >     might_sleep();
> >     if (rtnl_is_locked())
> >         synchronize_rcu_expedited();
> >     else
> >         synchronize_rcu();
> > }
> > EXPORT_SYMBOL(synchronize_net);
> > 
> > Thanks
> 
> Not the wait - expedited is not a good thing to allow unpriveledged
> userspace to do, it interrupts all VMs running on the same box.
> 
> We could use a callback though the docs warn userspace can use that
> to cause a DOS and needs to be limited.

the whole __tun_set_steering_ebpf() looks odd to me.
There is tun_attach_filter/tun_detach_filter pattern
that works for classic BPF. Why for eBPF this strange
synchronize_net() is there?



Re: [PATCH v2] can: Use common error handling code in vxcan_newlink()

2017-11-01 Thread Oliver Hartkopp



On 11/01/2017 03:16 PM, SF Markus Elfring wrote:

From: Markus Elfring 
Date: Wed, 1 Nov 2017 14:56:15 +0100

Add a jump target so that a bit of exception handling can be better reused
at the end of this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring 


Acked-by: Oliver Hartkopp 

Again: Posting such a patch on linux-...@vger.kernel.org is ENOUGH!

No need to cross post such a patch additionally on

netdev@vger.kernel.org
linux-ker...@vger.kernel.org
kernel-janit...@vger.kernel.org

and to each of the maintainers

m...@pengutronix.de
w...@grandegger.com
socket...@hartkopp.net

We all subscribed the mailing list and listen to it.
That's the intention of a mailing list ...

Cross posting is not appreciated in the community.

Thanks,
Oliver


---

v2:
An approach to make two checks for a failure predicate a bit safer
was rejected on 2017-10-28.
The possibility remains to reconsider such an adjustment later again.
https://lkml.org/lkml/2017/10/28/125
https://lkml.kernel.org/r/<264b3c2b-8354-5769-639c-ac8d2fcbe...@hartkopp.net>

  drivers/net/can/vxcan.c | 16 
  1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index 8404e8852a0f..5d1753cfacea 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -227,10 +227,8 @@ static int vxcan_newlink(struct net *net, struct 
net_device *dev,
netif_carrier_off(peer);
  
  	err = rtnl_configure_link(peer, ifmp);

-   if (err < 0) {
-   unregister_netdevice(peer);
-   return err;
-   }
+   if (err < 0)
+   goto unregister_network_device;
  
  	/* register first device */

if (tb[IFLA_IFNAME])
@@ -239,10 +237,8 @@ static int vxcan_newlink(struct net *net, struct 
net_device *dev,
snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
  
  	err = register_netdevice(dev);

-   if (err < 0) {
-   unregister_netdevice(peer);
-   return err;
-   }
+   if (err < 0)
+   goto unregister_network_device;
  
  	netif_carrier_off(dev);
  
@@ -254,6 +250,10 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,

rcu_assign_pointer(priv->peer, dev);
  
  	return 0;

+
+unregister_network_device:
+   unregister_netdevice(peer);
+   return err;
  }
  
  static void vxcan_dellink(struct net_device *dev, struct list_head *head)




Re: WARNING in skb_warn_bad_offload

2017-11-01 Thread Dmitry Vyukov
On Wed, Nov 1, 2017 at 9:48 PM, syzbot

wrote:
> Hello,
>
> syzkaller hit the following crash on
> 720bbe532b7c8f5613b48dea627fc58ed9ace707
> git://git.cmpxchg.org/linux-mmots.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers


This also happens on more recent commits, including linux-next
36ef71cae353f88fd6e095e2aaa3e5953af1685d (Oct 20):

syz0: caps=(0x040058c1, 0x) len=4203
data_len=2810 gso_size=8465 gso_type=3 ip_summed=0
[ cut here ]
WARNING: CPU: 0 PID: 3473 at net/core/dev.c:2618
skb_warn_bad_offload.cold.139+0x224/0x261 net/core/dev.c:2613
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 3473 Comm: a.out Not tainted 4.14.0-rc5-next-20171018 #15
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x1a8/0x272 lib/dump_stack.c:52
 panic+0x21e/0x4b7 kernel/panic.c:183
 __warn.cold.6+0x182/0x187 kernel/panic.c:546
 report_bug+0x232/0x330 lib/bug.c:183
 fixup_bug+0x3f/0x90 arch/x86/kernel/traps.c:177
 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline]
 do_trap+0x132/0x280 arch/x86/kernel/traps.c:260
 do_error_trap+0x11f/0x390 arch/x86/kernel/traps.c:297
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310
 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:skb_warn_bad_offload.cold.139+0x224/0x261 net/core/dev.c:2613
RSP: 0018:880064797038 EFLAGS: 00010286
RAX: 006f RBX: 88006365efe8 RCX: 
RDX: 006f RSI: 815c88c1 RDI: ed000c8f2dfd
RBP: 880064797090 R08: 8800686f86c0 R09: 0002
R10: 8800686f86c0 R11:  R12: 8800538b1680
R13:  R14: 8800538b1680 R15: 2111
 __skb_gso_segment+0x69e/0x860 net/core/dev.c:2824
 skb_gso_segment include/linux/netdevice.h:3971 [inline]
 validate_xmit_skb+0x29f/0xca0 net/core/dev.c:3074
 validate_xmit_skb_list+0xb7/0x120 net/core/dev.c:3125
 sch_direct_xmit+0x5b5/0x710 net/sched/sch_generic.c:181
 __dev_xmit_skb net/core/dev.c:3206 [inline]
 __dev_queue_xmit+0x1e41/0x2350 net/core/dev.c:3473
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3538
 packet_snd net/packet/af_packet.c:2956 [inline]
 packet_sendmsg+0x487a/0x64b0 net/packet/af_packet.c:2981
 sock_sendmsg_nosec net/socket.c:632 [inline]
 sock_sendmsg+0xd2/0x120 net/socket.c:642
 ___sys_sendmsg+0x7cc/0x900 net/socket.c:2048
 __sys_sendmsg+0xe6/0x220 net/socket.c:2082
 SYSC_sendmsg net/socket.c:2093 [inline]
 SyS_sendmsg+0x36/0x60 net/socket.c:2089
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x44bab9
RSP: 002b:007eff18 EFLAGS: 0246 ORIG_RAX: 002e
RAX: ffda RBX: 20001046 RCX: 0044bab9
RDX: 4010 RSI: 207fcfc8 RDI: 0004
RBP: 0086 R08: 850b2da14d2a3706 R09: 
R10: 1b91126b7f398aaa R11: 0246 R12: 
R13: 00407950 R14: 004079e0 R15: 





> [ cut here ]
> WARNING: CPU: 0 PID: 2986 at net/core/dev.c:2585
> skb_warn_bad_offload+0x2a9/0x380 net/core/dev.c:2580
> Kernel panic - not syncing: panic_on_warn set ...
>
> CPU: 0 PID: 2986 Comm: syzkaller546001 Not tainted 4.13.0-mm1+ #7
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:52
>  panic+0x1e4/0x417 kernel/panic.c:181
>  __warn+0x1c4/0x1d9 kernel/panic.c:542
>  report_bug+0x211/0x2d0 lib/bug.c:183
>  fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
>  do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
>  do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
>  do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
>  invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
> RIP: 0010:skb_warn_bad_offload+0x2a9/0x380 net/core/dev.c:2580
> RSP: 0018:8801ce73f0a0 EFLAGS: 00010282
> RAX: 006f RBX: 8801cd84cde0 RCX: 
> RDX: 006f RSI: 110039ce7dd4 RDI: ed0039ce7e08
> RBP: 8801ce73f0f8 R08: 8801ce73e790 R09: 
> R10:  R11:  R12: 8801ce7802c0
> R13:  R14: 8801ce7802c0 R15: 2111
>  __skb_gso_segment+0x607/0x7f0 net/core/dev.c:2791
>  skb_gso_segment include/linux/netdevice.h:3951 [inline]
>  validate_xmit_skb+0x4ba/0xb20 net/core/dev.c:3041
>  validate_xmit_skb_list+0xb7/0x120 net/core/dev.c:3092
>  sch_direct_xmit+0x3b6/0x6d0 net/sched/sch_generic.c:181
>  __dev_xmit_skb net/core/dev.c:3173 [inline]
>  

Re: [PATCH net-next] security: bpf: replace include of linux/bpf.h with forward declarations

2017-11-01 Thread Alexei Starovoitov
On Wed, Nov 01, 2017 at 11:48:00AM -0700, Jakub Kicinski wrote:
> Touching linux/bpf.h makes us rebuild a surprisingly large
> portion of the kernel.  Remove the unnecessary dependency
> from security.h, it only needs forward declarations.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 

Acked-by: Alexei Starovoitov 

thanks for fixing! I noticed larger rebuilds as well.



[PATCH 2/2] netfilter: nf_reject_ipv4: Fix use-after-free in send_reset

2017-11-01 Thread Pablo Neira Ayuso
From: Tejaswi Tanikella 

niph is not updated after pskb_expand_head changes the skb head. It
still points to the freed data, which is then used to update tot_len and
checksum. This could cause use-after-free poison crash.

Update niph, if ip_route_me_harder does not fail.

This only affects the interaction with REJECT targets and br_netfilter.

Signed-off-by: Tejaswi Tanikella 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c 
b/net/ipv4/netfilter/nf_reject_ipv4.c
index eeacbdaf7cdf..5cd06ba3535d 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -132,6 +132,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, 
int hook)
if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
 
+   niph = ip_hdr(nskb);
+
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
goto free_nskb;
-- 
2.11.0



[PATCH 1/2] netfilter: nft_set_hash: disable fast_ops for 2-len keys

2017-11-01 Thread Pablo Neira Ayuso
From: Anatole Denis 

jhash_1word of a u16 is a different value from jhash of the same u16 with
length 2.
Since elements are always inserted in sets using jhash over the actual
klen, this would lead to incorrect lookups on fixed-size sets with a key
length of 2, as they would be inserted with hash value jhash(key, 2) and
looked up with hash value jhash_1word(key), which is different.

Example reproducer(v4.13+), using anonymous sets which always have a
fixed size:

  table inet t {
  chain c {
  type filter hook output priority 0; policy accept;
  tcp dport { 10001, 10003, 10005, 10007, 10009 } counter 
packets 4 bytes 240 reject
  tcp dport 10001 counter packets 4 bytes 240 reject
  tcp dport 10003 counter packets 4 bytes 240 reject
  tcp dport 10005 counter packets 4 bytes 240 reject
  tcp dport 10007 counter packets 0 bytes 0 reject
  tcp dport 10009 counter packets 4 bytes 240 reject
  }
  }

then use nc -z localhost  to probe; incorrectly hashed ports will
pass through the set lookup and increment the counter of an individual
rule.

jhash being seeded with a random value, it is not deterministic which
ports will incorrectly hash, but in testing with 5 ports in the set I
always had 4 or 5 with an incorrect hash value.

Signed-off-by: Anatole Denis 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nft_set_hash.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 0fa01d772c5e..9c0d5a7ce5f9 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -643,7 +643,6 @@ nft_hash_select_ops(const struct nft_ctx *ctx, const struct 
nft_set_desc *desc,
 {
if (desc->size) {
switch (desc->klen) {
-   case 2:
case 4:
return _hash_fast_ops;
default:
-- 
2.11.0



[PATCH 0/2] Netfilter fixes for net

2017-11-01 Thread Pablo Neira Ayuso
Hi David,

The following patchset contains two one-liner fixes for your net tree,
they are:

1) Disable fast hash operations for 2-byte keys, which are leading
   to incorrect lookups in nf_tables, from Anatole Denis.

2) Reload pointer ipv4 header after ip_route_me_harder() given this may
   result in use-after-free due to skbuff header reallocation, patch
   from Tejaswi Tanikella.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 28e33f9d78eefe98ea86673ab31e988b37a9a738:

  bpf: disallow arithmetic operations on context pointer (2017-10-18 13:21:13 
+0100)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 7400bb4b5800831581a82f71700af6a5e815c3c8:

  netfilter: nf_reject_ipv4: Fix use-after-free in send_reset (2017-11-01 
12:15:29 +0100)


Anatole Denis (1):
  netfilter: nft_set_hash: disable fast_ops for 2-len keys

Tejaswi Tanikella (1):
  netfilter: nf_reject_ipv4: Fix use-after-free in send_reset

 net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++
 net/netfilter/nft_set_hash.c| 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)


[PATCH net-next] security: bpf: replace include of linux/bpf.h with forward declarations

2017-11-01 Thread Jakub Kicinski
Touching linux/bpf.h makes us rebuild a surprisingly large
portion of the kernel.  Remove the unnecessary dependency
from security.h, it only needs forward declarations.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 include/linux/security.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index 18800b0911e5..73f1ef625d40 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 
 struct linux_binprm;
 struct cred;
@@ -1732,6 +1731,10 @@ static inline void securityfs_remove(struct dentry 
*dentry)
 #endif
 
 #ifdef CONFIG_BPF_SYSCALL
+union bpf_attr;
+struct bpf_map;
+struct bpf_prog;
+struct bpf_prog_aux;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
-- 
2.14.1



[PATCH net-next] net: systemport: Only inspect valid switch port & queues

2017-11-01 Thread Florian Fainelli
Esoteric board configurations where port 0 is not available would still
make SYSTEMPORT inspect the switch port 0, queue 0, which, not being
enabled, would cause transmit timeouts over time. Just ignore those
unconfigured rings instead.

Fixes: 84ff33eeb23d ("net: systemport: Establish DSA network device queue 
mapping")
Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 9 +++--
 drivers/net/ethernet/broadcom/bcmsysport.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index dcee843d05d7..e6da9b165bbe 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1420,8 +1420,12 @@ static int bcm_sysport_init_tx_ring(struct 
bcm_sysport_priv *priv,
/* Configure QID and port mapping */
reg = tdma_readl(priv, TDMA_DESC_RING_MAPPING(index));
reg &= ~(RING_QID_MASK | RING_PORT_ID_MASK << RING_PORT_ID_SHIFT);
-   reg |= ring->switch_queue & RING_QID_MASK;
-   reg |= ring->switch_port << RING_PORT_ID_SHIFT;
+   if (ring->inspect) {
+   reg |= ring->switch_queue & RING_QID_MASK;
+   reg |= ring->switch_port << RING_PORT_ID_SHIFT;
+   } else {
+   reg |= RING_IGNORE_STATUS;
+   }
tdma_writel(priv, reg, TDMA_DESC_RING_MAPPING(index));
tdma_writel(priv, 0, TDMA_DESC_RING_PCP_DEI_VID(index));
 
@@ -2108,6 +2112,7 @@ static int bcm_sysport_map_queues(struct net_device *dev,
 */
ring->switch_queue = q;
ring->switch_port = port;
+   ring->inspect = true;
priv->ring_map[q + port * num_tx_queues] = ring;
 
/* Set all queues as being used now */
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h 
b/drivers/net/ethernet/broadcom/bcmsysport.h
index 82f70a6783cb..f5a984c1c986 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -714,6 +714,7 @@ struct bcm_sysport_tx_ring {
unsigned long   bytes;  /* bytes statistics */
unsigned intswitch_queue;   /* switch port queue number */
unsigned intswitch_port;/* switch port queue number */
+   boolinspect;/* inspect switch port and queue */
 };
 
 /* Driver private structure */
-- 
2.9.3



Re: [PATCH] net: recvmsg: Unconditionally zero struct sockaddr_storage

2017-11-01 Thread Kees Cook
On Wed, Nov 1, 2017 at 5:48 AM, Eric W. Biederman  wrote:
> Eric Dumazet  writes:
>
>> On Tue, 2017-10-31 at 09:14 -0700, Kees Cook wrote:
>>> Some protocols do not correctly wipe the contents of the on-stack
>>> struct sockaddr_storage sent down into recvmsg() (e.g. SCTP), and leak
>>> kernel stack contents to userspace. This wipes it unconditionally before
>>> per-protocol handlers run.
>>>
>>> Note that leaks like this are mitigated by building with
>>> CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y
>>>
>>> Reported-by: Alexander Potapenko 
>>> Cc: "David S. Miller" 
>>> Cc: netdev@vger.kernel.org
>>> Signed-off-by: Kees Cook 
>>> ---
>>>  net/socket.c | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/net/socket.c b/net/socket.c
>>> index c729625eb5d3..34183f4fbdf8 100644
>>> --- a/net/socket.c
>>> +++ b/net/socket.c
>>> @@ -2188,6 +2188,7 @@ static int ___sys_recvmsg(struct socket *sock, struct 
>>> user_msghdr __user *msg,
>>>  struct sockaddr __user *uaddr;
>>>  int __user *uaddr_len = COMPAT_NAMELEN(msg);
>>>
>>> +memset(, 0, sizeof(addr));
>>>  msg_sys->msg_name = 
>>>
>>
>> This kind of patch comes every year.
>>
>> Standard answer is : We fix the buggy protocol, we do not make
>> everything slower just because we are lazy.
>>
>> struct sockaddr is 128 bytes, but IPV4 only uses a fraction of it.
>>
>> Also memset() is using long word stores, so next 4-byte or 2-byte stores
>> on same location hit a performance problem on x86.
>>
>> By adding all these defensive programming, we would give strong
>> incentives to bypass the kernel for networking. That would be bad.
>
> In this case it looks like the root cause is something in sctp
> not filling in the ipv6 sin6_scope_id.
>
> Which is not only a leak but a correctness bug.
>
> I ran the reproducer test program and while none of the leak checkers
> are telling me anything I have gotten as far as seeing that the returned
> length is correct and sometimes nonsense.
>
> Hmm.
>
> At a quick look it looks like all that is necessary is to do this:
>
> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> index 51c488769590..6301913d0516 100644
> --- a/net/sctp/ipv6.c
> +++ b/net/sctp/ipv6.c
> @@ -807,9 +807,10 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, 
> char *msgname,
> addr->v6.sin6_flowinfo = 0;
> addr->v6.sin6_port = sh->source;
> addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
> -   if (ipv6_addr_type(>v6.sin6_addr) & 
> IPV6_ADDR_LINKLOCAL) {
> +   if (ipv6_addr_type(>v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
> addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb);
> -   }
> +   else
> +   addr->v6.sin6_scope_id = 0;
> }
>
> *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr);
>
> Eric
>

Thanks for digging into this Eric! Alexander, can you confirm this
fixes it for you when CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL is not
enabled?

-Kees

-- 
Kees Cook
Pixel Security


  1   2   3   >