Re: [RFC PATCH net-next 5/6] net: dsa: mv88e6060: add register defines header file

2015-11-02 Thread Andrew Lunn
On Mon, Nov 02, 2015 at 11:57:27AM +0100, Neil Armstrong wrote:
> To align with the mv88e6xxx code, add a similar header file
> with all the register defines.
> The file is based on the mv88e6xxx header for coherency.

Hi Neil

I did a --side-by-side diff between this and mv88e6xxx.h. I don't
think there is enough the same to allow using mv88e6xxx.c functions
with the mv88e6060 driver :-(

 Andrew

> 
> Signed-off-by: Neil Armstrong 
> ---
>  drivers/net/dsa/mv88e6060.h | 108 
> 
>  1 file changed, 108 insertions(+)
>  create mode 100644 drivers/net/dsa/mv88e6060.h
> 
> diff --git a/drivers/net/dsa/mv88e6060.h b/drivers/net/dsa/mv88e6060.h
> new file mode 100644
> index 000..ed3b3ac
> --- /dev/null
> +++ b/drivers/net/dsa/mv88e6060.h
> @@ -0,0 +1,108 @@
> +/*
> + * net/dsa/mv88e6060.h - Marvell 88e6060 switch chip support
> + * Copyright (c) 2008 Marvell Semiconductor
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#ifndef __MV88E6060_H
> +#define __MV88E6060_H
> +
> +#define MV88E6060_PORTS  6
> +
> +#define REG_PORT(p)  (0x8 + (p))
> +#define PORT_STATUS  0x00
> +#define PORT_STATUS_PAUSE_EN BIT(15)
> +#define PORT_STATUS_MY_PAUSE BIT(14)
> +#define PORT_STATUS_FC   (PORT_STATUS_MY_PAUSE | PORT_STATUS_PAUSE_EN)
> +#define PORT_STATUS_RESOLVED BIT(13)
> +#define PORT_STATUS_LINK BIT(12)
> +#define PORT_STATUS_PORTMODE BIT(11)
> +#define PORT_STATUS_PHYMODE  BIT(10)
> +#define PORT_STATUS_DUPLEX   BIT(9)
> +#define PORT_STATUS_SPEED BIT(8)
> +#define PORT_SWITCH_ID   0x03
> +#define PORT_SWITCH_ID_6060  0x0600
> +#define PORT_SWITCH_ID_6060_MASK 0xfff0
> +#define PORT_SWITCH_ID_6060_R1   0x0601
> +#define PORT_SWITCH_ID_6060_R2   0x0602
> +#define PORT_CONTROL 0x04
> +#define PORT_CONTROL_FORCE_FLOW_CTRL BIT(15)
> +#define PORT_CONTROL_TRAILER BIT(14)
> +#define PORT_CONTROL_HEADER  BIT(11)
> +#define PORT_CONTROL_INGRESS_MODE BIT(8)
> +#define PORT_CONTROL_VLAN_TUNNEL BIT(7)
> +#define PORT_CONTROL_STATE_MASK  0x03
> +#define PORT_CONTROL_STATE_DISABLED  0x00
> +#define PORT_CONTROL_STATE_BLOCKING  0x01
> +#define PORT_CONTROL_STATE_LEARNING  0x02
> +#define PORT_CONTROL_STATE_FORWARDING 0x03
> +#define PORT_VLAN_MAP 0x06
> +#define PORT_VLAN_MAP_DBNUM_SHIFT 12
> +#define PORT_VLAN_MAP_TABLE_MASK 0x1f
> +#define PORT_ASSOC_VECTOR 0x0b
> +#define PORT_ASSOC_VECTOR_MONITOR BIT(15)
> +#define PORT_ASSOC_VECTOR_PAV_MASK   0x1f
> +#define PORT_RX_CNTR 0x10
> +#define PORT_TX_CNTR 0x11
> +
> +#define REG_GLOBAL   0x0f
> +#define GLOBAL_STATUS 0x00
> +#define GLOBAL_STATUS_SW_MODE_MASK   (0x3 << 12)
> +#define GLOBAL_STATUS_SW_MODE_0  (0x0 << 12)
> +#define GLOBAL_STATUS_SW_MODE_1  (0x1 << 12)
> +#define GLOBAL_STATUS_SW_MODE_2  (0x2 << 12)
> +#define GLOBAL_STATUS_SW_MODE_3  (0x3 << 12)
> +#define GLOBAL_STATUS_INIT_READY BIT(11)
> +#define GLOBAL_STATUS_ATU_FULL   BIT(3)
> +#define GLOBAL_STATUS_ATU_DONE   BIT(2)
> +#define GLOBAL_STATUS_PHY_INT BIT(1)
> +#define GLOBAL_STATUS_EEINT  BIT(0)
> +#define GLOBAL_MAC_01 0x01
> +#define GLOBAL_MAC_01_DIFF_ADDR  BIT(8)
> +#define GLOBAL_MAC_23 0x02
> +#define GLOBAL_MAC_45 0x03
> +#define GLOBAL_CONTROL   0x04
> +#define GLOBAL_CONTROL_DISCARD_EXCESS BIT(13)
> +#define GLOBAL_CONTROL_MAX_FRAME_1536 BIT(10)
> +#define GLOBAL_CONTROL_RELOAD_EEPROM BIT(9)
> +#define GLOBAL_CONTROL_CTRMODE   BIT(8)
> +#define GLOBAL_CONTROL_ATU_FULL_EN   BIT(3)
> +#define GLOBAL_CONTROL_ATU_DONE_EN   BIT(2)
> +#define GLOBAL_CONTROL_PHYINT_EN BIT(1)
> +#define GLOBAL_CONTROL_EEPROM_DONE_EN BIT(0)
> +#define GLOBAL_ATU_CONTROL   0x0a
> +#define GLOBAL_ATU_CONTROL_SWRESET   BIT(15)
> +#define GLOBAL_ATU_CONTROL_LEARNDIS  BIT(14)
> +#define GLOBAL_ATU_CONTROL_ATUSIZE_256   (0x0 << 12)
> +#define GLOBAL_ATU_CONTROL_ATUSIZE_512   (0x1 << 12)
> +#define GLOBAL_ATU_CONTROL_ATUSIZE_1024  (0x2 << 12)
> +#define GLOBAL_ATU_CONTROL_ATE_AGE_SHIFT 4
> +#define GLOBAL_ATU_CONTROL_ATE_AGE_MASK  (0xff << 4)
> +#define GLOBAL_ATU_CONTROL_ATE_AGE_5MIN  (0x13 << 4)
> +#define GLOBAL_ATU_OP 0x0b
> +#define GLOBAL_ATU_OP_BUSY   BIT(15)
> +#define GLOBAL_ATU_OP_NOP (0 << 12)
> +#define GLOBAL_ATU_OP_FLUSH_ALL  ((1 << 12) | GLOBAL_ATU_OP_BUSY)
> +#define GLOBAL_ATU_OP_FLUSH_UNLOCKED ((2 << 12) | GLOBAL_ATU_OP_BUSY)
> +#define GLOBAL_ATU_OP_LOAD_DB ((3 << 12) | GLOBAL_ATU_OP_BUSY)
> +#define GLOBAL_ATU_OP_GET_NEXT_DB ((4 << 12) | 

Re: [PATCH v2 net-next] net: dsa: mv88e6xxx: assert SMI lock

2015-11-02 Thread Andrew Lunn
On Fri, Oct 30, 2015 at 06:56:45PM -0400, Vivien Didelot wrote:
> It's easy to forget to lock the smi_mutex before calling the low-level
> _mv88e6xxx_reg_{read,write}, so add an assert_smi_lock function in them.
> 
> Signed-off-by: Vivien Didelot 

Acked-by: Andrew Lunn 


Since there is no followup fixes patch, I assume we actually have it
correct at the moment?

Thanks
Andrew

> ---
>  drivers/net/dsa/mv88e6xxx.c | 25 ++---
>  1 file changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
> index b1b14f5..78a179b 100644
> --- a/drivers/net/dsa/mv88e6xxx.c
> +++ b/drivers/net/dsa/mv88e6xxx.c
> @@ -24,6 +24,16 @@
>  #include 
>  #include "mv88e6xxx.h"
>  
> +static void assert_smi_lock(struct dsa_switch *ds)
> +{
> + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
> +
> + if (unlikely(!mutex_is_locked(&ps->smi_mutex))) {
> + dev_err(ds->master_dev, "SMI lock not held!\n");
> + dump_stack();
> + }
> +}
> +
>  /* If the switch's ADDR[4:0] strap pins are strapped to zero, it will
>   * use all 32 SMI bus addresses on its SMI bus, and all switch registers
>   * will be directly accessible on some {device address,register address}
> @@ -80,12 +90,13 @@ int __mv88e6xxx_reg_read(struct mii_bus *bus, int 
> sw_addr, int addr, int reg)
>   return ret & 0xffff;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg)
>  {
>   struct mii_bus *bus = dsa_host_dev_to_mii_bus(ds->master_dev);
>   int ret;
>  
> + assert_smi_lock(ds);
> +
>   if (bus == NULL)
>   return -EINVAL;
>  
> @@ -143,12 +154,13 @@ int __mv88e6xxx_reg_write(struct mii_bus *bus, int 
> sw_addr, int addr,
>   return 0;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg,
>   u16 val)
>  {
>   struct mii_bus *bus = dsa_host_dev_to_mii_bus(ds->master_dev);
>  
> + assert_smi_lock(ds);
> +
>   if (bus == NULL)
>   return -EINVAL;
>  
> @@ -204,7 +216,6 @@ int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 
> *addr)
>   return 0;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum)
>  {
>   if (addr >= 0)
> @@ -212,7 +223,6 @@ static int _mv88e6xxx_phy_read(struct dsa_switch *ds, int 
> addr, int regnum)
>   return 0xffff;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum,
>   u16 val)
>  {
> @@ -538,7 +548,6 @@ out:
>   mutex_unlock(&ps->smi_mutex);
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_stats_wait(struct dsa_switch *ds)
>  {
>   int ret;
> @@ -553,7 +562,6 @@ static int _mv88e6xxx_stats_wait(struct dsa_switch *ds)
>   return -ETIMEDOUT;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_stats_snapshot(struct dsa_switch *ds, int port)
>  {
>   int ret;
> @@ -576,7 +584,6 @@ static int _mv88e6xxx_stats_snapshot(struct dsa_switch 
> *ds, int port)
>   return 0;
>  }
>  
> -/* Must be called with SMI mutex held */
>  static void _mv88e6xxx_stats_read(struct dsa_switch *ds, int stat, u32 *val)
>  {
>   u32 _val;
> @@ -789,7 +796,6 @@ void mv88e6xxx_get_regs(struct dsa_switch *ds, int port,
>   }
>  }
>  
> -/* Must be called with SMI lock held */
>  static int _mv88e6xxx_wait(struct dsa_switch *ds, int reg, int offset,
>  u16 mask)
>  {
> @@ -839,14 +845,12 @@ int mv88e6xxx_eeprom_busy_wait(struct dsa_switch *ds)
> GLOBAL2_EEPROM_OP_BUSY);
>  }
>  
> -/* Must be called with SMI lock held */
>  static int _mv88e6xxx_atu_wait(struct dsa_switch *ds)
>  {
>   return _mv88e6xxx_wait(ds, REG_GLOBAL, GLOBAL_ATU_OP,
>  GLOBAL_ATU_OP_BUSY);
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_phy_read_indirect(struct dsa_switch *ds, int addr,
>   int regnum)
>  {
> @@ -865,7 +869,6 @@ static int _mv88e6xxx_phy_read_indirect(struct dsa_switch 
> *ds, int addr,
>   return _mv88e6xxx_reg_read(ds, REG_GLOBAL2, GLOBAL2_SMI_DATA);
>  }
>  
> -/* Must be called with SMI mutex held */
>  static int _mv88e6xxx_phy_write_indirect(struct dsa_switch *ds, int addr,
>int regnum, u16 val)
>  {
> -- 
> 2.6.2
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net-next 0/6] net: dsa: mv88e6060: cleanup and fix setup

2015-11-02 Thread Vivien Didelot
Hi Neil,

On Nov. Monday 02 (45) 11:57 AM, Neil Armstrong wrote:
> This patchset introduces some fixes and a register addressing cleanup for
> the mv88e6060 DSA driver.
> 
> The first patch removes the poll_link callback, as in mv88e6xxx.
> The 3 following patches fix the setup with regard to the datasheet.
> The last 2 patches introduce a clean header and replace all magic values.
> 
> Neil Armstrong (6):
>   net: dsa: mv88e6060: remove poll_link callback
>   net: dsa: mv88e6060: use the correct InitReady bit
>   net: dsa: mv88e6060: use the correct MaxFrameSize bit
>   net: dsa: mv88e6060: use the correct bit shift for mac0
>   net: dsa: mv88e6060: add register defines header file
>   net: dsa: mv88e6060: replace magic values with register defines

Nice cleanup. I'll just be a bit picky here, so you may not consider my
comment for this patchset, but maybe for the future ones. Unless I'm
mistaken, there is no reason to group all these patches together.

The first 4 patches are independent fixes, and thus could have been sent
separately to netdev -net.

Then the last 2 ones could have been squashed together, because I don't
see a real value to separate them since you duplicate some defines, e.g.
REG_PORT. And this patch would be a candidate for netdev -net-next.

Thanks,
-v
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] net: dsa: mv88e6xxx: assert SMI lock

2015-11-02 Thread Vivien Didelot
Hi Andrew,

On Nov. Monday 02 (45) 04:02 PM, Andrew Lunn wrote:
> On Fri, Oct 30, 2015 at 06:56:45PM -0400, Vivien Didelot wrote:
> > It's easy to forget to lock the smi_mutex before calling the low-level
> > _mv88e6xxx_reg_{read,write}, so add an assert_smi_lock function in them.
> > 
> > Signed-off-by: Vivien Didelot 
> 
> Acked-by: Andrew Lunn 
> 
> 
> Since there is no followup fixes patch, I assume we actually have it
> correct at the moment?

Yes, this v2 contains the 2 fixups you mentioned (removing the inline
keyword and keeping the dsa_host_dev_to_mii_bus call as is). Thus this
version is correct.

Thanks,
-v
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel panic in 4.2.3, rb_erase in sch_fq

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 16:11 +0200, Denys Fedoryshchenko wrote:
> Hi!
> 
> Actually it seems I was getting this panic for a while (once per week) on 
> a loaded pppoe server, but only just now was able to get the full panic message.
> After checking the commit logs on sch_fq.c I didn't see any fixes, so 
> probably upgrading to a newer kernel won't help?

I do not think we support sch_fq as a HTB leaf.

If you want both HTB and sch_fq, you need to setup a bonding device.

HTB on bond0

sch_fq on the slaves

Sure, the kernel should not crash, but HTB+sch_fq on same net device is
certainly not something that will work anyway.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Convert smsc911x to use ACPI as well as DT

2015-11-02 Thread Jeremy Linton

On 09/09/2015 11:10 AM, Marc Zyngier wrote:

Jeremy,

I can see two issues here: we have a screaming interrupt, and
we seem to corrupt some workqueue.

How did you get this to work? Firmware release?


Marc,

I'm responding because it's been a month or so since my last response, 
and I haven't forgotten about this issue.


First, any custom tianocore build (*) should work. The required changes 
have been in the last few linaro snapshots as well 
(http://snapshots.linaro.org/components/kernel/linaro-edk2/, currently 
at 40) but I personally haven't had a lot of luck with the prebuilt 
images due to problems unrelated to this change. Others may have more luck.


* For those that don't know, tianocore is at:

https://github.com/tianocore/edk2.git
Use the master branch

After setting the environment variables/dependencies appropriately:

make -f ArmPlatformPkg/ArmJunoPkg/Makefile all

Will create a functional ACPI firmware image for all recent kernels, 
including ACPI/PCIe ones.


Thanks for everyone's patience on this,






--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/8] mm: memcontrol: account socket memory in unified hierarchy

2015-11-02 Thread Vladimir Davydov
On Thu, Oct 29, 2015 at 10:52:28AM -0700, Johannes Weiner wrote:
...
> Now, you mentioned that you'd rather see the socket buffers accounted
> at the allocator level, but I looked at the different allocation paths
> and network protocols and I'm not convinced that this makes sense. We
> don't want to be in the hotpath of every single packet when a lot of
> them are small, short-lived management blips that don't involve user
> space to let the kernel dispose of them.
> 
> __sk_mem_schedule() on the other hand is already wired up to exactly
> those consumers we are interested in for memory isolation: those with
> bigger chunks of data attached to them and those that have exploding
> receive queues when userspace fails to read(). UDP and TCP.
> 
> I mean, there is a reason why the global memory limits apply to only
> those types of packets in the first place: everything else is noise.
> 
> I agree that it's appealing to account at the allocator level and set
> page->mem_cgroup etc. but in this case we'd pay extra to capture a lot
> of noise, and I don't want to pay that just for aesthetics. In this
> case it's better to track ownership on the socket level and only count
> packets that can accumulate a significant amount of memory consumed.

Sigh, you seem to be right. Moreover, I can't even think of a neat way
to account skb pages to memcg, because rcv skbs are generated in device
drivers, where we don't know which socket/memcg it will go to. We could
recharge individual pages when skb gets to the network or transport
layer, but it would result in unjustified overhead.

> 
> > > We tried using the per-memcg tcp limits, and that prevents the OOMs
> > > for sure, but it's horrendous for network performance. There is no
> > > "stop growing" phase, it just keeps going full throttle until it hits
> > > the wall hard.
> > > 
> > > Now, we could probably try to replicate the global knobs and add a
> > > per-memcg soft limit. But you know better than anyone else how hard it
> > > is to estimate the overall workingset size of a workload, and the
> > > margins on containerized loads are razor-thin. Performance is much
> > > more sensitive to input errors, and often times parameters must be
> > > adjusted continuously during the runtime of a workload. It'd be
> > > disastrous to rely on yet more static, error-prone user input here.
> > 
> > Yeah, but the dynamic approach proposed in your patch set doesn't
> > guarantee we won't hit OOM in memcg due to overgrown buffers. It just
> > reduces this possibility. Of course, memcg OOM is not nearly as disastrous
> > as the global one, but it still usually means workload breakage.
> 
> Right now, the entire machine breaks. Confining it to a faulty memcg,
> as well as reducing the likelihood of that OOM in many cases seems
> like a good move in the right direction, no?

It seems. However, memcg OOM is also bad, we should strive to avoid it
if we can.

> 
> And how likely are memcg OOMs because of this anyway? There is of

Frankly, I've no idea. Your arguments below sound reassuring though.

> course a scenario imaginable where the packets pile up, followed by
> some *other* part of the workload, the one that doesn't read() and
> process packets, trying to expand--which then doesn't work and goes
> OOM. But that seems like a complete corner case. In the vast majority
> of cases, the application will be in full operation and just fail to
> read() fast enough--because the network bandwidth is enormous compared
> to the container's size, or because it shares the CPU with thousands
> of other workloads and there is scheduling latency.
> 
> This would be the perfect point to rein in the transmit window...
> 
> > The static approach is error-prone for sure, but it has existed for
> > years and worked satisfactorily AFAIK.
> 
> ...but that point is not a fixed amount of memory consumed. It depends
> on the workload and the random interactions it's having with thousands
> of other containers on that same machine.
> 
> The point of containers is to maximize utilization of your hardware
> and systematically eliminate slack in the system. But it's exactly
> that slack on dedicated bare-metal machines that allowed us to take a
> wild guess at the settings and then tune them based on observing a
> handful of workloads. This approach is not going to work anymore when
> we pack the machine to capacity and still expect every single
> container out of thousands to perform well. We need that automation.

But we do use a static approach when setting memory limits, no?
memory.{low,high,max} - they are all static.

I understand it's appealing to have just one knob - memory size - like
in case of virtual machines, but it doesn't seem to work with
containers. You added memory.low and memory.high knobs. VMs don't have
anything like that. How is one supposed to set them? Depends on the
workload, I guess. Also, there is the pids cgroup for limiting the
number of pids that can be used by a cgroup, because 

net: lockdep warning in ip_mc_msfget (net/ipv4/igmp.c:2400)

2015-11-02 Thread Sasha Levin
Hi all,

While fuzzing with syzkaller inside a KVM tools guest running the latest -next, 
I saw
the following warning:

[ 2391.993558] ==
[ 2391.995441] [ INFO: possible circular locking dependency detected ]
[ 2391.995771] 4.3.0-rc6-next-20151022-sasha-00042-g2b253a1-dirty #2618 Not 
tainted
[ 2391.995771] ---
[ 2391.995771] syzkaller_execu/14105 is trying to acquire lock:
[ 2391.995771] (rtnl_mutex){+.+.+.}, at: rtnl_lock (net/core/rtnetlink.c:71)
[ 2391.995771] Mutex: counter: 1 owner: None
[ 2391.995771]
[ 2391.995771] but task is already holding lock:
[ 2391.995771] (sk_lock-AF_INET){+.+.+.}, at: do_ip_getsockopt 
(net/ipv4/ip_sockglue.c:1274)
[ 2391.995771]
[ 2391.995771] which lock already depends on the new lock.
[ 2391.995771]
[ 2391.995771]
[ 2391.995771] the existing dependency chain (in reverse order) is:
[ 2391.995771] -> #1 (sk_lock-AF_INET){+.+.+.}:
[ 2391.995771] lock_acquire (kernel/locking/lockdep.c:3620)
[ 2391.995771] lock_sock_nested (include/linux/bottom_half.h:31 
net/core/sock.c:2411)
[ 2391.995771] do_ip_setsockopt.isra.9 (net/ipv4/ip_sockglue.c:623)
[ 2391.995771] ip_setsockopt (net/ipv4/ip_sockglue.c:1202)
[ 2391.995771]ff, 0x0)
[ 2391.995771] sock_common_setsockopt (net/core/sock.c:2610)
[ 2391.995771] SyS_setsockopt (net/socket.c:1756 net/socket.c:1736)
[ 2391.995771] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188)
[ 2391.995771] -> #0 (rtnl_mutex){+.+.+.}:
[ 2391.995771] __lock_acquire (kernel/locking/lockdep.c:1877 
kernel/locking/lockdep.c:1982 kernel/locking/lockdep.c:2168 
kernel/locking/lockdep.c:3239)
[ 2391.995771] lock_acquire (kernel/locking/lockdep.c:3620)
[ 2391.995771] mutex_lock_nested (kernel/locking/mutex.c:526 
kernel/locking/mutex.c:618)
[ 2391.995771] rtnl_lock (net/core/rtnetlink.c:71)
[ 2391.995771] ip_mc_msfget (net/ipv4/igmp.c:2400)
[ 2391.995771] do_ip_getsockopt (net/ipv4/ip_sockglue.c:1401)
[ 2391.995771] ip_getsockopt (net/ipv4/ip_sockglue.c:1498)
[ 2391.995771] raw_getsockopt (net/ipv4/raw.c:851)
[ 2391.995771] sock_common_getsockopt (net/core/sock.c:2569)
[ 2391.995771] SyS_getsockopt (net/socket.c:1787 net/socket.c:1770)
[ 2391.995771] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188)
[ 2391.995771]
[ 2391.995771] other info that might help us debug this:
[ 2391.995771]
[ 2391.995771]  Possible unsafe locking scenario:
[ 2391.995771]
[ 2391.995771]CPU0CPU1
[ 2391.995771]
[ 2391.995771]   lock(sk_lock-AF_INET);
[ 2391.995771]lock(rtnl_mutex);
[ 2391.995771]lock(sk_lock-AF_INET);
[ 2391.995771]   lock(rtnl_mutex);
[ 2391.995771]
[ 2391.995771]  *** DEADLOCK ***
[ 2391.995771]
[ 2391.995771] 1 lock held by syzkaller_execu/14105:
[ 2391.995771] #0: (sk_lock-AF_INET){+.+.+.}, at: do_ip_getsockopt 
(net/ipv4/ip_sockglue.c:1274)
[ 2391.995771]
[ 2391.995771] stack backtrace:
[ 2391.995771] CPU: 1 PID: 14105 Comm: syzkaller_execu Not tainted 
4.3.0-rc6-next-20151022-sasha-00042-g2b253a1-dirty #2618
[ 2391.995771]  0001 c179c8c9 8800a403f550 
ade32a2b
[ 2391.995771]  bb7f5a50 bb84a4a0 bb7f5a50 
8800a403f5a0
[ 2391.995771]  ac43fca8 8800a403f690 a3e18000 
8800a3e18000
[ 2391.995771] Call Trace:
[ 2391.995771] dump_stack (lib/dump_stack.c:52)
[ 2391.995771] print_circular_bug (kernel/locking/lockdep.c:1250)
[ 2391.995771] __lock_acquire (kernel/locking/lockdep.c:1877 
kernel/locking/lockdep.c:1982 kernel/locking/lockdep.c:2168 
kernel/locking/lockdep.c:3239)
[ 2391.995771] lock_acquire (kernel/locking/lockdep.c:3620)
[ 2391.995771] mutex_lock_nested (kernel/locking/mutex.c:526 
kernel/locking/mutex.c:618)
[ 2391.995771] rtnl_lock (net/core/rtnetlink.c:71)
[ 2391.995771] ip_mc_msfget (net/ipv4/igmp.c:2400)
[ 2391.995771] do_ip_getsockopt (net/ipv4/ip_sockglue.c:1401)
[ 2391.995771] ip_getsockopt (net/ipv4/ip_sockglue.c:1498)
[ 2391.995771] raw_getsockopt (net/ipv4/raw.c:851)
[ 2391.995771] sock_common_getsockopt (net/core/sock.c:2569)
[ 2391.995771] SyS_getsockopt (net/socket.c:1787 net/socket.c:1770)
[ 2391.995771] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188)


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/4] sfc: allocate rx pages on the same node as the interrupt

2015-11-02 Thread Daniel Pieczko (dpieczko)
On 28/10/15 15:59, Eric Dumazet wrote:
> On Wed, 2015-10-28 at 15:01 +, Shradha Shah wrote:
>> From: Daniel Pieczko 
>>
>> When the interrupt servicing a channel is on a NUMA node that is
>> not local to the device, performance is improved by allocating
>> rx pages on the node local to the interrupt (remote to the device)
>>
>> The performance-optimal case, where interrupts and applications
>> are pinned to CPUs on the same node as the device, is not altered
>> by this change.
>>
>> This change gave a 1% improvement in transaction rate using Nginx
>> with all interrupts and Nginx threads on the node remote to the
>> device. It also gave a small reduction in round-trip latency,
>> again with the interrupt and application on a different node to
>> the device.
>>
>> Allocating rx pages based on the channel->irq_node value is only
>> valid for the initial driver-load interrupt affinities; if an
>> interrupt is moved later, the wrong node may be used for the
>> allocation.
>>
>> Signed-off-by: Shradha Shah 
>> ---
>>  drivers/net/ethernet/sfc/efx.c|  1 +
>>  drivers/net/ethernet/sfc/net_driver.h |  3 +++
>>  drivers/net/ethernet/sfc/rx.c | 14 +-
>>  3 files changed, 13 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
>> index 974637d..89fbd03 100644
>> --- a/drivers/net/ethernet/sfc/efx.c
>> +++ b/drivers/net/ethernet/sfc/efx.c
>> @@ -445,6 +445,7 @@ efx_alloc_channel(struct efx_nic *efx, int i, struct 
>> efx_channel *old_channel)
>>  channel->efx = efx;
>>  channel->channel = i;
>>  channel->type = &efx_default_channel_type;
>> +channel->irq_node = NUMA_NO_NODE;
>>  
>>  for (j = 0; j < EFX_TXQ_TYPES; j++) {
>>  tx_queue = &channel->tx_queue[j];
>> diff --git a/drivers/net/ethernet/sfc/net_driver.h 
>> b/drivers/net/ethernet/sfc/net_driver.h
>> index ad56231..0ab9080a 100644
>> --- a/drivers/net/ethernet/sfc/net_driver.h
>> +++ b/drivers/net/ethernet/sfc/net_driver.h
>> @@ -419,6 +419,7 @@ enum efx_sync_events_state {
>>   * @sync_events_state: Current state of sync events on this channel
>>   * @sync_timestamp_major: Major part of the last ptp sync event
>>   * @sync_timestamp_minor: Minor part of the last ptp sync event
>> + * @irq_node: NUMA node of interrupt
>>   */
>>  struct efx_channel {
>>  struct efx_nic *efx;
>> @@ -477,6 +478,8 @@ struct efx_channel {
>>  enum efx_sync_events_state sync_events_state;
>>  u32 sync_timestamp_major;
>>  u32 sync_timestamp_minor;
>> +
>> +int irq_node;
>>  };
>>  
>>  #ifdef CONFIG_NET_RX_BUSY_POLL
>> diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
>> index 3f0e129..c5ef1e8 100644
>> --- a/drivers/net/ethernet/sfc/rx.c
>> +++ b/drivers/net/ethernet/sfc/rx.c
>> @@ -168,11 +168,15 @@ static int efx_init_rx_buffers(struct efx_rx_queue 
>> *rx_queue, bool atomic)
>>   * context in such a case.  So, use __GFP_NO_WARN
>>   * in case of atomic.
>>   */
>> -page = alloc_pages(__GFP_COLD | __GFP_COMP |
>> -   (atomic ?
>> -(GFP_ATOMIC | __GFP_NOWARN)
>> -: GFP_KERNEL),
>> -   efx->rx_buffer_order);
>> +struct efx_channel *channel;
>> +
>> +channel = efx_rx_queue_channel(rx_queue);
>> +page = alloc_pages_node(channel->irq_node, __GFP_COMP |
>> +(atomic ?
>> + (GFP_ATOMIC | __GFP_NOWARN)
>> + : GFP_KERNEL),
>> +efx->rx_buffer_order);
>> +
>>  if (unlikely(page == NULL))
>>  return -ENOMEM;
>>  dma_addr =
>>
>
> Sorry, I do not understand this patch, and why the following one is not
> squashed on this one.
>
> irq_node is always NUMA_NO_NODE (in this patch)
>
> So you claim a 1% improvement, switching from alloc_pages(...) to
> alloc_pages_node(NUMA_NO_NODE, ...) ???
>

You're correct that this doesn't make sense as it is.  There is something 
missing in this patch (channel->irq_node should be set) and also changing the 
order of some patches could make this clearer.  The series will need to be 
resent.


Daniel

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net-next 0/6] net: dsa: mv88e6060: cleanup and fix setup

2015-11-02 Thread Andrew Lunn
On Mon, Nov 02, 2015 at 11:57:01AM +0100, Neil Armstrong wrote:
> This patchset introduces some fixes and a register addressing cleanup for
> the mv88e6060 DSA driver.
> 
> The first patch removes the poll_link callback, as in mv88e6xxx.
> The 3 following patches fix the setup with regard to the datasheet.
> The last 2 patches introduce a clean header and replace all magic values.

Hi Neil

Nice patchset. Once you have fixed Sergei's comment, it looks good to
go.

Acked-by: Andrew Lunn 

  Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] net: dsa: mv88e6xxx: lookup switch name

2015-11-02 Thread Andrew Lunn
On Fri, Oct 30, 2015 at 07:39:48PM -0400, Vivien Didelot wrote:
> All the mv88e6xxx drivers use the exact same code in their probe
> function to lookup the switch name given its ID. Thus introduce a
> mv88e6xxx_switch_id structure and a mv88e6xxx_lookup_name function in
> the common mv88e6xxx code.
> 
> In the meantime make __mv88e6xxx_reg_{read,write} static since we do not
> need to expose these low-level r/w routines anymore.

Acked-by: Andrew Lunn 

Thanks
Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] net: avoid NULL deref in inet_ctl_sock_destroy()

2015-11-02 Thread Eric Dumazet
From: Eric Dumazet 

Under low memory conditions, tcp_sk_init() and icmp_sk_init()
can both iterate on all possible cpus and call inet_ctl_sock_destroy(),
with eventual NULL pointer.

Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov 
---
 include/net/inet_common.h |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 279f83591971..109e3ee9108c 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -41,7 +41,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int 
len,
 
 static inline void inet_ctl_sock_destroy(struct sock *sk)
 {
-   sock_release(sk->sk_socket);
+   if (sk)
+   sock_release(sk->sk_socket);
 }
 
 #endif


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: convert hashtab lock to raw lock

2015-11-02 Thread Shi, Yang

On 10/31/2015 11:37 AM, Daniel Borkmann wrote:

On 10/31/2015 02:47 PM, Steven Rostedt wrote:

On Fri, 30 Oct 2015 17:03:58 -0700
Alexei Starovoitov  wrote:

On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:

When running bpf samples on rt kernel, it reports the below warning:

BUG: sleeping function called from invalid context at
kernel/locking/rtmutex.c:917
in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
Preemption disabled at:[] kprobe_perf_func+0x30/0x228

...

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..972b76b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
  struct bpf_htab {
  struct bpf_map map;
  struct hlist_head *buckets;
-spinlock_t lock;
+raw_spinlock_t lock;


How do we address such things in general?
I bet there are tons of places around the kernel that
call spin_lock from atomic.
I'd hate to lose the benefits of lockdep of non-raw spin_lock
just to make rt happy.


You wont lose any benefits of lockdep. Lockdep still checks
raw_spin_lock(). The only difference between raw_spin_lock and
spin_lock is that in -rt spin_lock turns into an rt_mutex() and
raw_spin_lock stays a spin lock.


( Btw, Yang, would have been nice if your commit description would have
   already included such info, not only that you convert it, but also why
   it's okay to do so. )


I think Thomas's document will include all the information about rt spin 
lock/raw spin lock, etc.


Alexei & Daniel,

If you think such info is necessary, I definitely could add it into the 
commit log in v2.





The error is that in -rt, you called a mutex and not a spin lock while
atomic.


You are right, I think this happens due to the preempt_disable() in the
trace_call_bpf() handler. So, I think the patch seems okay. The dep_map
is btw union'ed in the struct spinlock case to the same offset of the
dep_map from raw_spinlock.

It's a bit inconvenient, though, when we add other library code as maps
in future, f.e. things like rhashtable as they would first need to be
converted to raw_spinlock_t as well, but judging from the git log, it
looks like common practice.


Yes, it is common practice for converting sleepable spin lock to raw 
spin lock in -rt to avoid scheduling in atomic context bug.


Thanks,
Yang



Thanks,
Daniel


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] sfc: push partner queue for skb->xmit_more

2015-11-02 Thread Martin Habets
When the IP stack passes SKBs the sfc driver puts them in 2 different TX
queues (called partners), one for checksummed and one for not checksummed.
If the SKB has xmit_more set the driver will delay pushing the work to the
NIC.

When later it does decide to push the buffers this patch ensures it also
pushes the partner queue, if that also has any delayed work. Before this
fix the work in the partner queue would be left for a long time and cause
a netdev watchdog.

Fixes: 70b33fb ("sfc: add support for skb->xmit_more")

Reported-by: Jianlin Shi 
Signed-off-by: Martin Habets 
---
 drivers/net/ethernet/sfc/ef10.c   |  4 +++-
 drivers/net/ethernet/sfc/farch.c  |  4 +++-
 drivers/net/ethernet/sfc/net_driver.h |  2 ++
 drivers/net/ethernet/sfc/tx.c | 30 --
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index ff649ebef637..286cc6b69d57 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -1849,7 +1849,9 @@ static void efx_ef10_tx_write(struct efx_tx_queue 
*tx_queue)
unsigned int write_ptr;
efx_qword_t *txd;
 
-   BUG_ON(tx_queue->write_count == tx_queue->insert_count);
+   tx_queue->xmit_more_available = false;
+   if (unlikely(tx_queue->write_count == tx_queue->insert_count))
+   return;
 
do {
write_ptr = tx_queue->write_count & tx_queue->ptr_mask;
diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c
index f08266f0eca2..5a1c5a8f278a 100644
--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -321,7 +321,9 @@ void efx_farch_tx_write(struct efx_tx_queue *tx_queue)
unsigned write_ptr;
unsigned old_write_count = tx_queue->write_count;
 
-   BUG_ON(tx_queue->write_count == tx_queue->insert_count);
+   tx_queue->xmit_more_available = false;
+   if (unlikely(tx_queue->write_count == tx_queue->insert_count))
+   return;
 
do {
write_ptr = tx_queue->write_count & tx_queue->ptr_mask;
diff --git a/drivers/net/ethernet/sfc/net_driver.h 
b/drivers/net/ethernet/sfc/net_driver.h
index c530e1c4cb4f..24038ef96d9f 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -219,6 +219,7 @@ struct efx_tx_buffer {
  * @tso_packets: Number of packets via the TSO xmit path
  * @pushes: Number of times the TX push feature has been used
  * @pio_packets: Number of times the TX PIO feature has been used
+ * @xmit_more_available: Are any packets waiting to be pushed to the NIC
  * @empty_read_count: If the completion path has seen the queue as empty
  * and the transmission path has not yet checked this, the value of
  * @read_count bitwise-added to %EFX_EMPTY_COUNT_VALID; otherwise 0.
@@ -253,6 +254,7 @@ struct efx_tx_queue {
unsigned int tso_packets;
unsigned int pushes;
unsigned int pio_packets;
+   bool xmit_more_available;
/* Statistics to supplement MAC stats */
unsigned long tx_packets;
 
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 1833a0146571..67f6afaa022f 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -431,8 +431,20 @@ finish_packet:
efx_tx_maybe_stop_queue(tx_queue);
 
/* Pass off to hardware */
-   if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq))
+   if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
+   struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
+
+   /* There could be packets left on the partner queue if those
+* SKBs had skb->xmit_more set. If we do not push those they
+* could be left for a long time and cause a netdev watchdog.
+*/
+   if (txq2->xmit_more_available)
+   efx_nic_push_buffers(txq2);
+
efx_nic_push_buffers(tx_queue);
+   } else {
+   tx_queue->xmit_more_available = skb->xmit_more;
+   }
 
tx_queue->tx_packets++;
 
@@ -722,6 +734,7 @@ void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
tx_queue->read_count = 0;
tx_queue->old_read_count = 0;
tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID;
+   tx_queue->xmit_more_available = false;
 
/* Set up TX descriptor ring */
efx_nic_init_tx(tx_queue);
@@ -747,6 +760,7 @@ void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
 
++tx_queue->read_count;
}
+   tx_queue->xmit_more_available = false;
netdev_tx_reset_queue(tx_queue->core_txq);
 }
 
@@ -1302,8 +1316,20 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue 
*tx_queue,
efx_tx_maybe_stop_queue(tx_queue);
 
/* Pass off to hardware */
-   if 

Re: [RFC PATCH net-next 2/6] net: dsa: mv88e6060: use the correct InitReady bit

2015-11-02 Thread Sergei Shtylyov

Hello.

On 11/2/2015 1:57 PM, Neil Armstrong wrote:


According to the mv88e6060 datasheet, the InitReady bit position
is 11 and the polarity is inverted.
Use the bit correctly to detect the end of initialization.

Signed-off-by: Neil Armstrong 
---
  drivers/net/dsa/mv88e6060.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index 6885ef5..c10880f 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -102,7 +102,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds)
timeout = jiffies + 1 * HZ;
while (time_before(jiffies, timeout)) {
ret = REG_READ(REG_GLOBAL, 0x00);
-   if ((ret & 0x8000) == 0x)
+   if ((ret & 0x800) != 0x)


   You could as well drop != 0.

[...]

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] net/core: generic support for disabling netdev features down stack

2015-11-02 Thread Jarod Wilson
There are some netdev features, which when disabled on an upper device,
such as a bonding master or a bridge, must be disabled and cannot be
re-enabled on underlying devices.

This is a rework of an earlier more heavy-handed approach, which simply
disables and prevents re-enabling of netdev features listed in a new
define in include/net/netdev_features.h, NETIF_F_UPPER_DISABLES. Any upper
device that disables a flag in that feature mask, the disabling will
propagate down the stack, and any lower device that has any upper device
with one of those flags disabled should not be able to enable said flag.

Initially, only LRO is included for proof of concept, and because this
code effectively does the same thing as dev_disable_lro(), though it will
also activate from the ethtool path, which was one of the goals here.

[root@dell-per730-01 ~]# ethtool -k bond0 |grep large
large-receive-offload: on
[root@dell-per730-01 ~]# ethtool -k p5p1 |grep large
large-receive-offload: on
[root@dell-per730-01 ~]# ethtool -K bond0 lro off
[root@dell-per730-01 ~]# ethtool -k bond0 |grep large
large-receive-offload: off
[root@dell-per730-01 ~]# ethtool -k p5p1 |grep large
large-receive-offload: off

dmesg dump:

[ 1033.277986] bond0: Disabling feature 0x8000 on lower dev p5p2.
[ 1034.067949] bnx2x :06:00.1 p5p2: using MSI-X  IRQs: sp 74  fp[0] 76 ... 
fp[7] 83
[ 1034.753612] bond0: Disabling feature 0x8000 on lower dev p5p1.
[ 1035.591019] bnx2x :06:00.0 p5p1: using MSI-X  IRQs: sp 62  fp[0] 64 ... 
fp[7] 71

This has been successfully tested with bnx2x, qlcnic and netxen network
cards as slaves in a bond interface. Turning LRO on or off on the master
also turns it on or off on each of the slaves, new slaves are added with
LRO in the same state as the master, and LRO can't be toggled on the
slaves.

Also, this should largely remove the need for dev_disable_lro(), and most,
if not all, of its call sites can be replaced by simply making sure
NETIF_F_LRO isn't included in the relevant device's feature flags.

Note that this patch is driven by bug reports from users saying it was
confusing that bonds and slaves had different settings for the same
features, and while it won't be 100% in sync if a lower device doesn't
support a feature like LRO, I think this is a good step in the right
direction.

CC: "David S. Miller" 
CC: Eric Dumazet 
CC: Jay Vosburgh 
CC: Veaceslav Falico 
CC: Andy Gospodarek 
CC: Jiri Pirko 
CC: Nikolay Aleksandrov 
CC: Michal Kubecek 
CC: Alexander Duyck 
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson 
---
Note: this replaces "[RFC PATCH net-next] net/core: initial support for
stacked dev feature toggles" for consideration.

 include/linux/netdev_features.h | 11 +
 net/core/dev.c  | 52 +
 2 files changed, 63 insertions(+)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9672781..0f5837a 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -125,6 +125,11 @@ enum {
 #define NETIF_F_HW_L2FW_DOFFLOAD   __NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL  __NETIF_F(BUSY_POLL)
 
+#define for_each_netdev_feature(mask_addr, feature)
\
+   int bit;
\
+   for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) 
\
+   feature = __NETIF_F_BIT(bit);
+
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
 #define NETIF_F_NEVER_CHANGE   (NETIF_F_VLAN_CHALLENGED | \
@@ -167,6 +172,12 @@ enum {
  */
 #define NETIF_F_ALL_FOR_ALL(NETIF_F_NOCACHE_COPY | NETIF_F_FSO)
 
+/*
+ * If upper/master device has these features disabled, they must be disabled
+ * on all lower/slave devices as well.
+ */
+#define NETIF_F_UPPER_DISABLES NETIF_F_LRO
+
 /* changeable features with no special hardware requirements */
 #define NETIF_F_SOFT_FEATURES  (NETIF_F_GSO | NETIF_F_GRO)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 13f49f8..3a8dbbc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6288,9 +6288,51 @@ static void rollback_registered(struct net_device *dev)
list_del();
 }
 
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+   struct net_device *upper, netdev_features_t features)
+{
+   netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+   netdev_features_t feature;
+
+   for_each_netdev_feature(&upper_disables, feature) {
+   if (!(upper->wanted_features & feature)
+   && (features & feature)) {
+   netdev_dbg(lower, "Dropping feature %pNF, upper dev %s 
has it 

Re: [PATCH net-next] net/core: generic support for disabling netdev features down stack

2015-11-02 Thread Alexander Duyck

On 11/02/2015 09:53 AM, Jarod Wilson wrote:

There are some netdev features, which when disabled on an upper device,
such as a bonding master or a bridge, must be disabled and cannot be
re-enabled on underlying devices.

This is a rework of an earlier more heavy-handed approach, which simply
disables and prevents re-enabling of netdev features listed in a new
define in include/net/netdev_features.h, NETIF_F_UPPER_DISABLES. Any upper
device that disables a flag in that feature mask, the disabling will
propagate down the stack, and any lower device that has any upper device
with one of those flags disabled should not be able to enable said flag.

Initially, only LRO is included for proof of concept, and because this
code effectively does the same thing as dev_disable_lro(), though it will
also activate from the ethtool path, which was one of the goals here.

[root@dell-per730-01 ~]# ethtool -k bond0 |grep large
large-receive-offload: on
[root@dell-per730-01 ~]# ethtool -k p5p1 |grep large
large-receive-offload: on
[root@dell-per730-01 ~]# ethtool -K bond0 lro off
[root@dell-per730-01 ~]# ethtool -k bond0 |grep large
large-receive-offload: off
[root@dell-per730-01 ~]# ethtool -k p5p1 |grep large
large-receive-offload: off

dmesg dump:

[ 1033.277986] bond0: Disabling feature 0x8000 on lower dev p5p2.
[ 1034.067949] bnx2x :06:00.1 p5p2: using MSI-X  IRQs: sp 74  fp[0] 76 ... 
fp[7] 83
[ 1034.753612] bond0: Disabling feature 0x8000 on lower dev p5p1.
[ 1035.591019] bnx2x :06:00.0 p5p1: using MSI-X  IRQs: sp 62  fp[0] 64 ... 
fp[7] 71

This has been successfully tested with bnx2x, qlcnic and netxen network
cards as slaves in a bond interface. Turning LRO on or off on the master
also turns it on or off on each of the slaves, new slaves are added with
LRO in the same state as the master, and LRO can't be toggled on the
slaves.

Also, this should largely remove the need for dev_disable_lro(), and most,
if not all, of its call sites can be replaced by simply making sure
NETIF_F_LRO isn't included in the relevant device's feature flags.

Note that this patch is driven by bug reports from users saying it was
confusing that bonds and slaves had different settings for the same
features, and while it won't be 100% in sync if a lower device doesn't
support a feature like LRO, I think this is a good step in the right
direction.

CC: "David S. Miller" 
CC: Eric Dumazet 
CC: Jay Vosburgh 
CC: Veaceslav Falico 
CC: Andy Gospodarek 
CC: Jiri Pirko 
CC: Nikolay Aleksandrov 
CC: Michal Kubecek 
CC: Alexander Duyck 
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson 
---
Note: this replaces "[RFC PATCH net-next] net/core: initial support for
stacked dev feature toggles" for consideration.

  include/linux/netdev_features.h | 11 +
  net/core/dev.c  | 52 +
  2 files changed, 63 insertions(+)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9672781..0f5837a 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -125,6 +125,11 @@ enum {
  #define NETIF_F_HW_L2FW_DOFFLOAD  __NETIF_F(HW_L2FW_DOFFLOAD)
  #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL)

+#define for_each_netdev_feature(mask_addr, feature)
\
+   int bit;
\
+   for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) 
\
+   feature = __NETIF_F_BIT(bit);
+
  /* Features valid for ethtool to change */
  /* = all defined minus driver/device-class-related */
  #define NETIF_F_NEVER_CHANGE  (NETIF_F_VLAN_CHALLENGED | \
@@ -167,6 +172,12 @@ enum {
   */
  #define NETIF_F_ALL_FOR_ALL   (NETIF_F_NOCACHE_COPY | NETIF_F_FSO)

+/*
+ * If upper/master device has these features disabled, they must be disabled
+ * on all lower/slave devices as well.
+ */
+#define NETIF_F_UPPER_DISABLES NETIF_F_LRO
+
  /* changeable features with no special hardware requirements */
  #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO)

diff --git a/net/core/dev.c b/net/core/dev.c
index 13f49f8..3a8dbbc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6288,9 +6288,51 @@ static void rollback_registered(struct net_device *dev)
list_del();
  }

+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+   struct net_device *upper, netdev_features_t features)
+{
+   netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+   netdev_features_t feature;
+
+   for_each_netdev_feature(&upper_disables, feature) {
+   if (!(upper->wanted_features & feature)
+   && (features & feature)) {
+   netdev_dbg(lower, 

Re: [BUG] Any-IP IPv6 support broken

2015-11-02 Thread Gilberto Bertin

> On 29 Oct 2015, at 17:44, Maciej Żenczykowski  wrote:
> 
> What are you trying to do?

I would like to have a "bind-to-subnet" semantic with IPv6.

This is currently working with IPv4, and the setup is as follows:

- setup a dummy network device configured with any-IP
- add an any-IP route
- bind() on the dummy device

in this way we can effectively bind a process to a particular subnet
(by binding it to a dummy device which is receiving all the packets
from a particular subnet).

The point of using dummy devices is that we can configure multiple ones
(and so we can bind multiple processes to multiple subnets).

> 
> Does what you're trying to do work on an older kernel?  Which kernel
> version does it break at?
> 
> btw. afaik any-ip doesn't work with IPv4 on any un-patched kernel (the
> IPv4 support patch was reverted).


This is actually working with IPv4 (as I said I'm using a recent kernel,
4.1), and the fact that you say it's not supposed to work leads me to
think that maybe we are not talking about the same feature.

Cheers,
gilberto


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v7, 0/6] Freescale DPAA FMan

2015-11-02 Thread igal.liberman
From: Igal Liberman 

The Freescale Data Path Acceleration Architecture (DPAA) is a set
of hardware components on specific QorIQ multicore processors.
This architecture provides the infrastructure to support
simplified sharing of networking interfaces and accelerators
by multiple CPU cores and the accelerators.

One of the DPAA accelerators is the Frame Manager (FMan)
which contains a series of hardware blocks: ports, Ethernet MACs,
a multi user RAM (MURAM) and Storage Profile (SP).

This patch set introduces the FMan drivers.
Each driver configures and initializes the corresponding
FMan hardware module (described above).
The MAC driver offers support for three different
types of MACs (eTSEC, TGEC, MEMAC).

v6 --> v7:
- Addressed compilation issue on non-PPC targets
- Removed B4860 rev 1 support

v5 --> v6:
- Addressed feedback from Scott:
- Moved kernel doc to source files
- Removed a series of configurable settings
- Miscellaneous code updates

v4 --> v5:
- Addressed feedback from David Miller:
- Removed driver layering
- Reduce namespace pollution
- Reduce code complexity and size

v3 --> v4:
- Remove device_initcall call in driver registration (redundant)
- Remove hot/cold labels
- Minor update in FMan Clock read from device-tree
- Update fixed-link support
- Addressed feedback from Stephen Hemminger
- Remove bogus blank line

v2 --> v3:
- Addressed feedback from Scott:
- Remove typedefs
- Remove unnecessary memory barriers
- Remove unnecessary casting
- Remove KConfig options
- Remove early_params
- Remove Hungarian notation
- Remove __packed__  attribute and padding from structures
- Remove unlikely attribute (where it's not needed)
- Use proper error codes and remove unnecessary prints
- Use proper values for sleep routines
- Replace complex Macros with functions
- Improve device tree processing code
- Use symbolic defines
- Add time-out in busy-wait loops
- Removed exit code (loadable module support will be added 
later)
- Fixed "fixed-link" issue raised by Joakim Tjernlund

v1 --> v2:
- Addressed feedback from Paul Bolle:
- General feedback of FMan Driver layer
- Remove Errata defines
- Aligned comments to Kernel Doc
- Remove Loadable Module support (not yet supported)
- Removed not needed KConfig dependencies
- Addressed feedback from Scott Wood
- Use Kernel ioread/iowrite services
- Squash FLIB source and header patches together

This submission is based on the prior Freescale DPAA FMan V3,RFC submission.
Several issues addressed in this submission:
- Reduced MAC layering and complexity
- Reduced code base
- T1024/T2080 10G best effort support

Igal Liberman (6):
  fsl/fman: Add FMan MURAM support
  fsl/fman: Add FMan support
  fsl/fman: Add FMan MAC support
  fsl/fman: Add FMan SP support
  fsl/fman: Add FMan Port Support
  fsl/fman: Add FMan MAC driver

 drivers/net/ethernet/freescale/Kconfig |1 +
 drivers/net/ethernet/freescale/Makefile|2 +
 drivers/net/ethernet/freescale/fman/Kconfig|8 +
 drivers/net/ethernet/freescale/fman/Makefile   |7 +
 .../net/ethernet/freescale/fman/crc_mac_addr_ext.h |  314 +++
 drivers/net/ethernet/freescale/fman/fman.c | 2876 
 drivers/net/ethernet/freescale/fman/fman.h |  325 +++
 drivers/net/ethernet/freescale/fman/fman_dtsec.c   | 1609 +++
 drivers/net/ethernet/freescale/fman/fman_dtsec.h   |   59 +
 drivers/net/ethernet/freescale/fman/fman_mac.h |  276 ++
 drivers/net/ethernet/freescale/fman/fman_memac.c   | 1307 +
 drivers/net/ethernet/freescale/fman/fman_memac.h   |   60 +
 drivers/net/ethernet/freescale/fman/fman_muram.c   |  159 ++
 drivers/net/ethernet/freescale/fman/fman_muram.h   |   51 +
 drivers/net/ethernet/freescale/fman/fman_port.c| 1779 
 drivers/net/ethernet/freescale/fman/fman_port.h|  151 +
 drivers/net/ethernet/freescale/fman/fman_sp.c  |  167 ++
 drivers/net/ethernet/freescale/fman/fman_sp.h  |  103 +
 drivers/net/ethernet/freescale/fman/fman_tgec.c|  798 ++
 drivers/net/ethernet/freescale/fman/fman_tgec.h|   55 +
 drivers/net/ethernet/freescale/fman/mac.c  |  980 +++
 drivers/net/ethernet/freescale/fman/mac.h  |   97 +
 22 files changed, 11184 insertions(+)
 create mode 100644 drivers/net/ethernet/freescale/fman/Kconfig
 create mode 100644 

Re: [PATCH v2 net-next] net: dsa: mv88e6xxx: assert SMI lock

2015-11-02 Thread Vivien Didelot
On Nov. Monday 02 (45) 04:02 PM, Andrew Lunn wrote:
> On Fri, Oct 30, 2015 at 06:56:45PM -0400, Vivien Didelot wrote:
> > It's easy to forget to lock the smi_mutex before calling the low-level
> > _mv88e6xxx_reg_{read,write}, so add a assert_smi_lock function in them.
> > 
> > Signed-off-by: Vivien Didelot 
> 
> Acked-by: Andrew Lunn 
> 
> 
> Since there is no followup fixes patch, i assume we actually have it
> correct at the moment?

Ho, I just caught what you meant ;-)

From my (minimal) tests, I didn't see any stack dump yet from setup, FDB
or VLAN operations, looks good so far.

Thanks,
-v
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next v4 2/8] dpaa_eth: add support for DPAA Ethernet

2015-11-02 Thread Madalin Bucur
This introduces the Freescale Data Path Acceleration Architecture
(DPAA) Ethernet driver (dpaa_eth) that builds upon the DPAA QMan,
BMan, PAMU and FMan drivers to deliver Ethernet connectivity on
the Freescale DPAA QorIQ platforms.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/Kconfig |2 +
 drivers/net/ethernet/freescale/Makefile|1 +
 drivers/net/ethernet/freescale/dpaa/Kconfig|   22 +
 drivers/net/ethernet/freescale/dpaa/Makefile   |   11 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |  819 
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |  432 +++
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.c  | 1299 
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.h  |   98 ++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth_sg.c  |  408 ++
 9 files changed, 3092 insertions(+)
 create mode 100644 drivers/net/ethernet/freescale/dpaa/Kconfig
 create mode 100644 drivers/net/ethernet/freescale/dpaa/Makefile
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth_sg.c

diff --git a/drivers/net/ethernet/freescale/Kconfig 
b/drivers/net/ethernet/freescale/Kconfig
index f3f89cc..92198be 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -92,4 +92,6 @@ config GIANFAR
  and MPC86xx family of chips, the eTSEC on LS1021A and the FEC
  on the 8540.
 
+source "drivers/net/ethernet/freescale/dpaa/Kconfig"
+
 endif # NET_VENDOR_FREESCALE
diff --git a/drivers/net/ethernet/freescale/Makefile 
b/drivers/net/ethernet/freescale/Makefile
index 4097c58..ae13dc5 100644
--- a/drivers/net/ethernet/freescale/Makefile
+++ b/drivers/net/ethernet/freescale/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_FSL_PQ_MDIO) += fsl_pq_mdio.o
 obj-$(CONFIG_FSL_XGMAC_MDIO) += xgmac_mdio.o
 obj-$(CONFIG_GIANFAR) += gianfar_driver.o
+obj-$(CONFIG_FSL_DPAA_ETH) += dpaa/
 obj-$(CONFIG_PTP_1588_CLOCK_GIANFAR) += gianfar_ptp.o
 gianfar_driver-objs := gianfar.o \
gianfar_ethtool.o
diff --git a/drivers/net/ethernet/freescale/dpaa/Kconfig 
b/drivers/net/ethernet/freescale/dpaa/Kconfig
new file mode 100644
index 000..022d5aa
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/Kconfig
@@ -0,0 +1,22 @@
+menuconfig FSL_DPAA_ETH
+   tristate "DPAA Ethernet"
+   depends on FSL_SOC && FSL_BMAN && FSL_QMAN && FSL_FMAN
+   select PHYLIB
+   select FSL_FMAN_MAC
+   ---help---
+ Data Path Acceleration Architecture Ethernet driver,
+ supporting the Freescale QorIQ chips.
+ Depends on Freescale Buffer Manager and Queue Manager
+ driver and Frame Manager Driver.
+
+if FSL_DPAA_ETH
+
+config FSL_DPAA_ETH_FRIENDLY_IF_NAME
+   bool "Use fmX-macY names for the DPAA interfaces"
+   default y
+   ---help---
+ The DPAA Ethernet netdevices are created for each FMan port available
+ on a certain board. Enable this to get interface names derived from
+ the underlying FMan hardware for a simple identification.
+
+endif # FSL_DPAA_ETH
diff --git a/drivers/net/ethernet/freescale/dpaa/Makefile 
b/drivers/net/ethernet/freescale/dpaa/Makefile
new file mode 100644
index 000..3847ec7
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Freescale DPAA Ethernet controllers
+#
+
+# Include FMan headers
+FMAN= $(srctree)/drivers/net/ethernet/freescale/fman
+ccflags-y += -I$(FMAN)
+
+obj-$(CONFIG_FSL_DPAA_ETH) += fsl_dpa.o
+
+fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
new file mode 100644
index 000..8381616
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -0,0 +1,819 @@
+/* Copyright 2008 - 2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *  names of its contributors may be used to endorse or promote products
+ *  derived from this software without specific prior written permission.
+ 

[net-next v4 6/8] dpaa_eth: add ethtool statistics

2015-11-02 Thread Madalin Bucur
Add a series of counters to be exported through ethtool:
- add detailed counters for reception errors;
- add detailed counters for QMan enqueue reject events;
- count the number of fragmented skbs received from the stack;
- count all frames received on the Tx confirmation path;
- add congestion group statistics;
- count the number of interrupts for each CPU.

Signed-off-by: Ioana Ciornei 
Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |  12 ++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |  34 
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.c  |  40 -
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.h  |   2 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth_sg.c  |   1 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 183 +
 6 files changed, 270 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 894f1a7..0b3332a 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -102,6 +102,15 @@ static void _dpa_rx_error(struct net_device *net_dev,
 
percpu_priv->stats.rx_errors++;
 
+   if (fd->status & FM_FD_ERR_DMA)
+   percpu_priv->rx_errors.dme++;
+   if (fd->status & FM_FD_ERR_PHYSICAL)
+   percpu_priv->rx_errors.fpe++;
+   if (fd->status & FM_FD_ERR_SIZE)
+   percpu_priv->rx_errors.fse++;
+   if (fd->status & FM_FD_ERR_PRS_HDR_ERR)
+   percpu_priv->rx_errors.phe++;
+
dpa_fd_release(net_dev, fd);
 }
 
@@ -167,6 +176,8 @@ static void _dpa_tx_conf(struct net_device *net_dev,
percpu_priv->stats.tx_errors++;
}
 
+   percpu_priv->tx_confirm++;
+
skb = _dpa_cleanup_tx_fd(priv, fd);
 
dev_kfree_skb(skb);
@@ -302,6 +313,7 @@ static void priv_ern(struct qman_portal *portal,
 
percpu_priv->stats.tx_dropped++;
percpu_priv->stats.tx_fifo_errors++;
+   count_ern(percpu_priv, msg);
 
/* If we intended this buffer to go into the pool
 * when the FM was done, we need to put it in
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index 87577cf..ccaadd9 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -192,6 +192,25 @@ struct dpa_bp {
void (*free_buf_cb)(void *addr);
 };
 
+struct dpa_rx_errors {
+   u64 dme;/* DMA Error */
+   u64 fpe;/* Frame Physical Error */
+   u64 fse;/* Frame Size Error */
+   u64 phe;/* Header Error */
+};
+
+/* Counters for QMan ERN frames - one counter per rejection code */
+struct dpa_ern_cnt {
+   u64 cg_tdrop;   /* Congestion group taildrop */
+   u64 wred;   /* WRED congestion */
+   u64 err_cond;   /* Error condition */
+   u64 early_window;   /* Order restoration, frame too early */
+   u64 late_window;/* Order restoration, frame too late */
+   u64 fq_tdrop;   /* FQ taildrop */
+   u64 fq_retired; /* FQ is retired */
+   u64 orp_zero;   /* ORP disabled */
+};
+
 struct dpa_napi_portal {
struct napi_struct napi;
struct qman_portal *p;
@@ -201,7 +220,13 @@ struct dpa_napi_portal {
 struct dpa_percpu_priv_s {
struct net_device *net_dev;
struct dpa_napi_portal *np;
+   u64 in_interrupt;
+   u64 tx_confirm;
+   /* fragmented (non-linear) skbuffs received from the stack */
+   u64 tx_frag_skbuffs;
struct rtnl_link_stats64 stats;
+   struct dpa_rx_errors rx_errors;
+   struct dpa_ern_cnt ern_cnt;
 };
 
 struct dpa_priv_s {
@@ -228,6 +253,14 @@ struct dpa_priv_s {
 * (and the same) congestion group.
 */
struct qman_cgr cgr;
+   /* If congested, when it began. Used for performance stats. */
+   u32 congestion_start_jiffies;
+   /* Number of jiffies the Tx port was congested. */
+   u32 congested_jiffies;
+   /* Counter for the number of times the CGR
+* entered congestion state
+*/
+   u32 cgr_congested_count;
} cgr_data;
/* Use a per-port CGR for ingress traffic. */
bool use_ingress_cgr;
@@ -289,6 +322,7 @@ static inline int dpaa_eth_napi_schedule(struct 
dpa_percpu_priv_s *percpu_priv,
 
np->p = portal;
napi_schedule(>napi);
+   percpu_priv->in_interrupt++;
return 1;
}
}
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
index 

Re: [PATCH net] net: avoid NULL deref in inet_ctl_sock_destroy()

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 17:53 +0100, Hannes Frederic Sowa wrote:
> On Mon, Nov 2, 2015, at 16:50, Eric Dumazet wrote:
> > From: Eric Dumazet 
> > 
> > Under low memory conditions, tcp_sk_init() and icmp_sk_init()
> > can both iterate on all possible cpus and call inet_ctl_sock_destroy(),
> > with eventual NULL pointer.
> > 
> > Signed-off-by: Eric Dumazet 
> > Reported-by: Dmitry Vyukov 
> 
> Eric, was this a private report or some of those floating around
> publicly?

Dmitry Vyukov filed two internal bug reports at Google,
not sure if he mentioned the issue elsewhere.

Google-Bug-Id: 25415196
Google-Bug-Id: 25416355

(But you do not have access to them)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net-next] net/core: initial support for stacked dev feature toggles

2015-11-02 Thread Jarod Wilson

Alexander Duyck wrote:

On 10/30/2015 09:25 AM, Jarod Wilson wrote:

...

Rather than outright dropping the second bit though, I was thinking
maybe just drop a note in dmesg along the lines of "hey, you shut off
LRO, it is still enabled on upper dev foo", to placate end-users.


I would rather not see it. It would be mostly noise. It is perfectly
valid to have LRO advertised on an upper device, but not supported on a
lower one. It basically just means that the path will allow LRO frames
through, it doesn't guarantee that we are going to provide them.


Okay, dropping this.

...

Same thing here. If a lower dev has it disabled then leave it
disabled. I believe your goal is to make it so that
dev_disable_lro() can shut down LRO when it is making packets in the
data-path unusable.


This is already the case since commit fbe168ba91f7 ("net: generic
dev_disable_lro() stacked device handling"). That commit makes sure
dev_disable_lro() is propagated down the stack and also makes sure new
slaves added to a bond/team with LRO disabled have it disabled too.

What it does not do is propagating LRO disabling down if it is disabled
in ways that do not call dev_disable_lro() (e.g. via ethtool). I'm not
sure if this should be done or not, both options have their pros and
cons.


Making it work with ethtool was one of my primary goals with this
change, as it was users prodding things with ethtool that prompted the
"hey, this doesn't make sense" bug reports.


I'd say make it work like dev_disable_lro already does. Disabling LRO
propagates down, enabling LRO only enables it on the specific device.

The way to think of it is as a warning flag. With LRO enabled this
device may report frames larger than MTU to the stack and will mangle
checksums. Without LRO all of the frames received should be restricted
to MTU. That is why you have to force the disabling down to all lower
devices, and why you cannot enable it if an upper device has it disabled.


However, I believe enabling LRO shouldn't be propagated down.


Hm. Devices that should never have LRO enabled still won't get it
enabled, so I'm not clear what harm it would cause. I tend to think you


How do you define "devices that should never have LRO enabled"?


No NETIF_F_LRO flag set in hw_features is what I was thinking.


The fact
is LRO is very messy in terms of the way it functions. Different drivers
handle it in different ways. Usually it results in the Rx checksum being
mangled, it provides frames larger than MTU, and uses fraglist instead
of frags on some drivers.


do want this sync'ing down the stack if set on an upper dev (i.e.,
ethtool -K bond0 lro on), for consistency's sake. You can always come
back through afterwards and disable things on lower devs individually if
they're really not wanted, since we're in agreement that we shouldn't
prevent disabling features on lower devices.


Think of it this way. Let's say I have a NIC that I know is problematic
when LRO is enabled, it might cause a kernel panic due to an skb
overrun. So I have a bond with it and some other NIC which can run with
LRO enabled without issues. How do I enable LRO on the other device
without causing a kernel panic, and without tearing apart the existing
bond? With the approach you have described I can't because I have to
enable it at the bond and doing so will enable it on the NIC with the
faulty implementation.


I'd argue that if enabling LRO on a device causes a panic, that device 
probably shouldn't be advertising LRO support, and the driver ought to 
be fixed, but that's somewhat tangential. I'm already sold on only 
disabling down the stack.



This is why we cannot enable LRO unless all upper devices support it,
and why we should propagate disabling LRO down to all lower devices.
Trying to force it on for a lower device just because the upper device
supports it is a bad idea because there are multiple LRO implementations
and they all behave very differently.


That's a bit concerning, given that we default to LRO on in a bond, as 
should all the slaves, regardless of which LRO implementation the device 
has (so long as the driver claims to support LRO, anyway).


But again, that's probably a separate issue, I've got a forthcoming 
patch that I'm still beating around and touching up, but I think looks 
sane and lines up with what you've suggested.



If nothing else you might start looking at working with a mask of
bits that function like this.  You could probably start with GRO,
LRO, and RXCSUM and work your way up from there.  If they aren't set
on the upper devices you cannot enable them, and if they are cleared
then they must be cleared on all lower devices.


For step one, I've added a feature mask and a new helper that iterates 
over it looking for set feature flags. In the case of the bnx2x equipped 
host I'm currently testing on, adding RXCSUM had an interesting and as 
yet unexplained side-effect of preventing LRO from being enabled on the 
bnx2x cards -- ethtool showed "off 

Re: [PATCH net] net: avoid NULL deref in inet_ctl_sock_destroy()

2015-11-02 Thread Dmitry Vyukov
On Mon, Nov 2, 2015 at 6:00 PM, Eric Dumazet  wrote:
> On Mon, 2015-11-02 at 17:53 +0100, Hannes Frederic Sowa wrote:
>> On Mon, Nov 2, 2015, at 16:50, Eric Dumazet wrote:
>> > From: Eric Dumazet 
>> >
>> > Under low memory conditions, tcp_sk_init() and icmp_sk_init()
>> > can both iterate on all possible cpus and call inet_ctl_sock_destroy(),
>> > with eventual NULL pointer.
>> >
>> > Signed-off-by: Eric Dumazet 
>> > Reported-by: Dmitry Vyukov 
>>
>> Eric, was this a private report or some of those floating around
>> publicly?
>
> Dmitry Vyukov filled two internal bug reports at Google,
> not sure if he mentioned the issue elsewhere.

No, I did not.
Can I now?

> Google-Bug-Id: 25415196
> Google-Bug-Id: 25416355
>
> (But you do not have access to them)
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] af_unix: optimize unix_writable by inlining

2015-11-02 Thread Aaron Conole
David Miller  writes:

> From: Aaron Conole 
> Date: Mon,  2 Nov 2015 12:01:59 -0500
>
>> unix_writable() originally was inlined, but was changed as part of
>> commit 1586a5877db9 ("af_unix: do not report POLLOUT on
>> listeners"). Re-enable the inline flag.
>> 
>> Signed-off-by: Aaron Conole 
>
> This is never appropriate.
>
> The compiler should be fixed to inline functions properly when
> appropriate for the optimization level requested.

Okay, apologies for the noise.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] net: fix percpu memory leaks

2015-11-02 Thread Hannes Frederic Sowa
On Mon, Nov 2, 2015, at 18:03, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> This patch fixes following problems :
> 
> 1) percpu_counter_init() can return an error, therefore
>   init_frag_mem_limit() must propagate this error so that
>   inet_frags_init_net() can do the same up to its callers.
> 
> 2) If ip[46]_frags_ns_ctl_register() fail, we must unwind
>properly and free the percpu_counter.
> 
> Without this fix, we leave freed object in percpu_counters
> global list (if CONFIG_HOTPLUG_CPU) leading to crashes.
> 
> This bug was detected by KASAN and syzkaller tool
> (http://github.com/google/syzkaller)
> 
> Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation
> mem accounting")
> Signed-off-by: Eric Dumazet 
> Reported-by: Dmitry Vyukov 
> Cc: Hannes Frederic Sowa 
> Cc: Jesper Dangaard Brouer 

Acked-by: Hannes Frederic Sowa 

Syzkaller tool looks amazing, seems like it got support for unshare :).

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] mpls: support for dead routes

2015-11-02 Thread Julia Lawall
Please check on what was intended.

julia

On Tue, 3 Nov 2015, kbuild test robot wrote:

> CC: kbuild-...@01.org
> In-Reply-To: <1446498529-50275-1-git-send-email-ro...@cumulusnetworks.com>
> TO: Roopa Prabhu 
> CC: ebied...@xmission.com, rshea...@brocade.com, da...@davemloft.net, 
> netdev@vger.kernel.org
> CC: da...@davemloft.net, netdev@vger.kernel.org
> 
> Hi Roopa,
> 
> [auto build test WARNING on net-next/master -- if it's inappropriate base, 
> please suggest rules for selecting the more suitable base]
> 
> url:
> https://github.com/0day-ci/linux/commits/Roopa-Prabhu/mpls-support-for-dead-routes/20151103-051211
> :: branch date: 88 minutes ago
> :: commit date: 88 minutes ago
> 
> >> net/mpls/af_mpls.c:702:6-22: duplicated argument to && or ||
> 
> git remote add linux-review https://github.com/0day-ci/linux
> git remote update linux-review
> git checkout 21fa92d07a7254a2042ded6647fc4b91332c6d0e
> vim +702 net/mpls/af_mpls.c
> 
> f8efb73c Roopa Prabhu2015-10-23  686  int nhs = 0;
> f8efb73c Roopa Prabhu2015-10-23  687  int err = 0;
> f8efb73c Roopa Prabhu2015-10-23  688  
> f8efb73c Roopa Prabhu2015-10-23  689  change_nexthops(rt) {
> f8efb73c Roopa Prabhu2015-10-23  690  int attrlen;
> f8efb73c Roopa Prabhu2015-10-23  691  
> f8efb73c Roopa Prabhu2015-10-23  692  nla_via = NULL;
> f8efb73c Roopa Prabhu2015-10-23  693  nla_newdst = NULL;
> f8efb73c Roopa Prabhu2015-10-23  694  
> f8efb73c Roopa Prabhu2015-10-23  695  err = -EINVAL;
> f8efb73c Roopa Prabhu2015-10-23  696  if (!rtnh_ok(rtnh, 
> remaining))
> f8efb73c Roopa Prabhu2015-10-23  697  goto errout;
> f8efb73c Roopa Prabhu2015-10-23  698  
> 1c78efa8 Robert Shearman 2015-10-23  699  /* neither weighted 
> multipath nor any flags
> 1c78efa8 Robert Shearman 2015-10-23  700   * are supported
> 1c78efa8 Robert Shearman 2015-10-23  701   */
> 21fa92d0 Roopa Prabhu2015-11-02 @702  if (rtnh->rtnh_flags || 
> rtnh->rtnh_flags)
> 1c78efa8 Robert Shearman 2015-10-23  703  goto errout;
> 1c78efa8 Robert Shearman 2015-10-23  704  
> f8efb73c Roopa Prabhu2015-10-23  705  attrlen = 
> rtnh_attrlen(rtnh);
> f8efb73c Roopa Prabhu2015-10-23  706  if (attrlen > 0) {
> f8efb73c Roopa Prabhu2015-10-23  707  struct nlattr 
> *attrs = rtnh_attrs(rtnh);
> f8efb73c Roopa Prabhu2015-10-23  708  
> f8efb73c Roopa Prabhu2015-10-23  709  nla_via = 
> nla_find(attrs, attrlen, RTA_VIA);
> f8efb73c Roopa Prabhu2015-10-23  710  nla_newdst = 
> nla_find(attrs, attrlen, RTA_NEWDST);
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next V1 0/7] Mellanox mlx5e driver update, Nov 3 2015

2015-11-02 Thread Or Gerlitz
On Tue, Nov 3, 2015 at 8:07 AM, Or Gerlitz  wrote:
> This series contains bunch of small fixes to the mlx5e driver from Achiad.

Oops, I missed your email from 2h ago... but these all ARE bug fixes,
so hopefully
I didn't really violate the directive (I guess I should have just
asked, but again,
I missed the email...)

Or.

> Applies on net-next commit e7b63ff "Merge branch 'master' of
> git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next"

> Changes from V0:
>   - removed the driver patch that dealt with IRQ affinity changes during
> NAPI poll, as this is a generic problem which needs generic solution.
>
> Achiad Shochat (7):
>   net/mlx5e: Avoid NULL pointer access in case of configuration failure
>   net/mlx5e: Wait for RX buffers initialization in a more proper manner
>   net/mlx5_core: Use the the real irqn in eq->irqn
>   net/mlx5e: Don't allow more than max supported channels
>   net/mlx5e: Return error in case mlx5e_set_features() fails
>   net/mlx5e: Re-eanble client vlan TX acceleration
>   net/mlx5e: Fix LSO vlan insertion
>
>  drivers/net/ethernet/mellanox/mlx5/core/en.h   |  6 +
>  .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  5 ++---
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 16 -
>  drivers/net/ethernet/mellanox/mlx5/core/en_tx.c| 26 
> +++---
>  drivers/net/ethernet/mellanox/mlx5/core/eq.c   |  8 +++
>  5 files changed, 46 insertions(+), 15 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 0/7] Mellanox mlx5e driver update, Nov 3 2015

2015-11-02 Thread Or Gerlitz
Hi Dave,

This series contains a bunch of small fixes to the mlx5e driver from Achiad.

Applies on net-next commit e7b63ff "Merge branch 'master' of 
git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next"

Or.

Changes from V0:
  - removed the driver patch that dealt with IRQ affinity changes during
NAPI poll, as this is a generic problem which needs generic solution.

Achiad Shochat (7):
  net/mlx5e: Avoid NULL pointer access in case of configuration failure
  net/mlx5e: Wait for RX buffers initialization in a more proper manner
  net/mlx5_core: Use the the real irqn in eq->irqn
  net/mlx5e: Don't allow more than max supported channels
  net/mlx5e: Return error in case mlx5e_set_features() fails
  net/mlx5e: Re-eanble client vlan TX acceleration
  net/mlx5e: Fix LSO vlan insertion

 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  6 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  5 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 16 -
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c| 26 +++---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c   |  8 +++
 5 files changed, 46 insertions(+), 15 deletions(-)

-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 1/7] net/mlx5e: Avoid NULL pointer access in case of configuration failure

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

In case a configuration operation that involves closing and re-opening
resources (e.g RX/TX queue size change) fails at the re-opening stage
these resources will remain closed.
So when executing (following) configuration operations (e.g ifconfig
down) we cannot assume that these resources are available.

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bb801a9..9df6f9a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1404,6 +1404,12 @@ int mlx5e_close_locked(struct net_device *netdev)
 {
struct mlx5e_priv *priv = netdev_priv(netdev);
 
+   /* May already be CLOSED in case a previous configuration operation
+* (e.g RX/TX queue size change) that involves close failed.
+*/
+   if (!test_bit(MLX5E_STATE_OPENED, >state))
+   return 0;
+
clear_bit(MLX5E_STATE_OPENED, >state);
 
mlx5e_redirect_rqts(priv);
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 4/7] net/mlx5e: Don't allow more than max supported channels

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

Consider MLX5E_MAX_NUM_CHANNELS @ethtool set/get_channels

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 5 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c| 3 +--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 0983a20..f2ae62d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -617,5 +617,11 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, NULL, cq->wq.cc);
 }
 
+static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
+{
+   return min_t(int, mdev->priv.eq_table.num_comp_vectors,
+MLX5E_MAX_NUM_CHANNELS);
+}
+
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index bce9126..2e022e9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -345,9 +345,8 @@ static void mlx5e_get_channels(struct net_device *dev,
   struct ethtool_channels *ch)
 {
struct mlx5e_priv *priv = netdev_priv(dev);
-   int ncv = priv->mdev->priv.eq_table.num_comp_vectors;
 
-   ch->max_combined   = ncv;
+   ch->max_combined   = mlx5e_get_max_num_channels(priv->mdev);
ch->combined_count = priv->params.num_channels;
 }
 
@@ -355,7 +354,7 @@ static int mlx5e_set_channels(struct net_device *dev,
  struct ethtool_channels *ch)
 {
struct mlx5e_priv *priv = netdev_priv(dev);
-   int ncv = priv->mdev->priv.eq_table.num_comp_vectors;
+   int ncv = mlx5e_get_max_num_channels(priv->mdev);
unsigned int count = ch->combined_count;
bool was_opened;
int err = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0bab33c..febf711 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2047,8 +2047,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev 
*mdev)
 {
struct net_device *netdev;
struct mlx5e_priv *priv;
-   int nch = min_t(int, mdev->priv.eq_table.num_comp_vectors,
-   MLX5E_MAX_NUM_CHANNELS);
+   int nch = mlx5e_get_max_num_channels(mdev);
int err;
 
if (mlx5e_check_required_hca_cap(mdev))
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 5/7] net/mlx5e: Return error in case mlx5e_set_features() fails

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

In case mlx5e_set_features() fails, return the failure status rather
than 0.

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index febf711..28eaed5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1843,7 +1843,7 @@ static int mlx5e_set_features(struct net_device *netdev,
mlx5e_disable_vlan_filter(priv);
}
 
-   return 0;
+   return err;
 }
 
 static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v2] ravb: use clock rate as basis for GTI.TIV

2015-11-02 Thread Simon Horman
The GTI.TIV may be set to 2^20 GHz / rate, i.e. (1GHz << 20) / rate, where rate is
that of the clock of the device. Rather than assuming a
rate of 130MHz use the actual rate of the clock.

The motivation for this is to use the correct rate on
the r8a7795/Salvator-X which is advertised as 133MHz but
may differ depending on the extal present on the Salvator-X.

Signed-off-by: Simon Horman 

---
Tested on the topic/gen3-latest branch of Geert Uytterhoeven's
renesas-drivers tree on kernel.org: 3f5a88be9fea ("[WIP] arm64: renesas:
r8a7795: Convert to new CPG/MSSR bindings")

v2
* Corrected typos in changelog, as pointed out by Geert Uytterhoeven
* Use do_div() rather than 64-bit division to allow compilation on
  32-bit ARM
---
 drivers/net/ethernet/renesas/ravb.h  |  3 +++
 drivers/net/ethernet/renesas/ravb_main.c | 38 +++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/renesas/ravb.h 
b/drivers/net/ethernet/renesas/ravb.h
index 0623fff932e4..f9dee7436e81 100644
--- a/drivers/net/ethernet/renesas/ravb.h
+++ b/drivers/net/ethernet/renesas/ravb.h
@@ -576,6 +576,9 @@ enum GTI_BIT {
GTI_TIV = 0x0FFF,
 };
 
+#define GTI_TIV_MAXGTI_TIV
+#define GTI_TIV_MIN0x20
+
 /* GIC */
 enum GIC_BIT {
GIC_PTCE= 0x0001,   /* Undocumented? */
diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
b/drivers/net/ethernet/renesas/ravb_main.c
index aa7b2083cb53..599334d68afe 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#include 
+
 #include "ravb.h"
 
 #define RAVB_DEF_MSG_ENABLE \
@@ -1659,6 +1661,38 @@ static const struct of_device_id ravb_match_table[] = {
 };
 MODULE_DEVICE_TABLE(of, ravb_match_table);
 
+static int ravb_set_gti(struct net_device *ndev)
+{
+
+   struct device *dev = ndev->dev.parent;
+   struct device_node *np = dev->of_node;
+   unsigned long rate;
+   struct clk *clk;
+   uint64_t inc;
+
+   clk = of_clk_get(np, 0);
+   if (IS_ERR(clk)) {
+   dev_err(dev, "could not get clock\n");
+   return PTR_ERR(clk);
+   }
+
+   rate = clk_get_rate(clk);
+   clk_put(clk);
+
+   inc = 10ULL << 20;
+   do_div(inc, rate);
+
+   if (inc < GTI_TIV_MIN || inc > GTI_TIV_MAX) {
+   dev_err(dev, "gti.tiv increment 0x%llx is outside the range 
0x%x - 0x%x\n",
+   inc, GTI_TIV_MIN, GTI_TIV_MAX);
+   return -EINVAL;
+   }
+
+   ravb_write(ndev, inc, GTI);
+
+   return 0;
+}
+
 static int ravb_probe(struct platform_device *pdev)
 {
struct device_node *np = pdev->dev.of_node;
@@ -1755,7 +1789,9 @@ static int ravb_probe(struct platform_device *pdev)
   CCC);
 
/* Set GTI value */
-   ravb_write(ndev, ((1000 << 20) / 130) & GTI_TIV, GTI);
+   error = ravb_set_gti(ndev);
+   if (error)
+   goto out_release;
 
/* Request GTI loading */
ravb_write(ndev, ravb_read(ndev, GCCR) | GCCR_LTI, GCCR);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 7/7] net/mlx5e: Fix LSO vlan insertion

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

Consider vlan insertion impact on headers copy size also for LSO
packets.

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 5105288..cd8f85a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -116,7 +116,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq 
*sq,
 * headers and occur before the data gather.
 * Therefore these headers must be copied into the WQE
 */
-#define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
+#define MLX5E_MIN_INLINE ETH_HLEN
 
if (bf && (skb_headlen(skb) <= sq->max_inline))
return skb_headlen(skb);
@@ -128,7 +128,7 @@ static inline void mlx5e_insert_vlan(void *start, struct 
sk_buff *skb, u16 ihs)
 {
struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start;
int cpy1_sz = 2 * ETH_ALEN;
-   int cpy2_sz = ihs - cpy1_sz - VLAN_HLEN;
+   int cpy2_sz = ihs - cpy1_sz;
 
skb_copy_from_linear_data(skb, vhdr, cpy1_sz);
skb_pull_inline(skb, cpy1_sz);
@@ -192,6 +192,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
 
if (skb_vlan_tag_present(skb)) {
mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs);
+   ihs += VLAN_HLEN;
} else {
skb_copy_from_linear_data(skb, eseg->inline_hdr_start, ihs);
skb_pull_inline(skb, ihs);
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 6/7] net/mlx5e: Re-eanble client vlan TX acceleration

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

This reverts commit cd58c714acb9 "net/mlx5e: Disable client vlan TX 
acceleration".

Bring back client vlan insertion offload, the original
performance issue was found and fixed in the next patch.

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 23 +--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 28eaed5..5fc4d2d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2004,6 +2004,7 @@ static void mlx5e_build_netdev(struct net_device *netdev)
netdev->vlan_features|= NETIF_F_LRO;
 
netdev->hw_features   = netdev->vlan_features;
+   netdev->hw_features  |= NETIF_F_HW_VLAN_CTAG_TX;
netdev->hw_features  |= NETIF_F_HW_VLAN_CTAG_RX;
netdev->hw_features  |= NETIF_F_HW_VLAN_CTAG_FILTER;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index b73672f..5105288 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -124,6 +124,21 @@ static inline u16 mlx5e_get_inline_hdr_size(struct 
mlx5e_sq *sq,
return MLX5E_MIN_INLINE;
 }
 
+static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs)
+{
+   struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start;
+   int cpy1_sz = 2 * ETH_ALEN;
+   int cpy2_sz = ihs - cpy1_sz - VLAN_HLEN;
+
+   skb_copy_from_linear_data(skb, vhdr, cpy1_sz);
+   skb_pull_inline(skb, cpy1_sz);
+   vhdr->h_vlan_proto = skb->vlan_proto;
+   vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb));
+   skb_copy_from_linear_data(skb, >h_vlan_encapsulated_proto,
+ cpy2_sz);
+   skb_pull_inline(skb, cpy2_sz);
+}
+
 static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 {
struct mlx5_wq_cyc   *wq   = >wq;
@@ -175,8 +190,12 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
ETH_ZLEN);
}
 
-   skb_copy_from_linear_data(skb, eseg->inline_hdr_start, ihs);
-   skb_pull_inline(skb, ihs);
+   if (skb_vlan_tag_present(skb)) {
+   mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs);
+   } else {
+   skb_copy_from_linear_data(skb, eseg->inline_hdr_start, ihs);
+   skb_pull_inline(skb, ihs);
+   }
 
eseg->inline_hdr_sz = cpu_to_be16(ihs);
 
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 2/7] net/mlx5e: Wait for RX buffers initialization in a more proper manner

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

Use jiffies rather than a wait loop with msleep().

The wait loop didn't take into consideration time when the
process was not executing.

Signed-off-by: Achiad Shochat 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9df6f9a..0bab33c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -442,12 +442,12 @@ static void mlx5e_disable_rq(struct mlx5e_rq *rq)
 
 static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
 {
+   unsigned long exp_time = jiffies + msecs_to_jiffies(2);
struct mlx5e_channel *c = rq->channel;
struct mlx5e_priv *priv = c->priv;
struct mlx5_wq_ll *wq = >wq;
-   int i;
 
-   for (i = 0; i < 1000; i++) {
+   while (time_before(jiffies, exp_time)) {
if (wq->cur_sz >= priv->params.min_rx_wqes)
return 0;
 
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next V1 3/7] net/mlx5_core: Use the the real irqn in eq->irqn

2015-11-02 Thread Or Gerlitz
From: Achiad Shochat 

Instead of storing the msix array index in eq->irqn (vecidx),
store the real irq number.

Signed-off-by: Achiad Shochat 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Or Gerlitz 
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 1f01fe8..713ead5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -382,10 +382,10 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct 
mlx5_eq *eq, u8 vecidx,
 name, pci_name(dev->pdev));
 
eq->eqn = out.eq_number;
-   eq->irqn = vecidx;
+   eq->irqn = priv->msix_arr[vecidx].vector;
eq->dev = dev;
eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
-   err = request_irq(priv->msix_arr[vecidx].vector, mlx5_msix_handler, 0,
+   err = request_irq(eq->irqn, mlx5_msix_handler, 0,
  priv->irq_info[vecidx].name, eq);
if (err)
goto err_eq;
@@ -421,12 +421,12 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, 
struct mlx5_eq *eq)
int err;
 
mlx5_debug_eq_remove(dev, eq);
-   free_irq(dev->priv.msix_arr[eq->irqn].vector, eq);
+   free_irq(eq->irqn, eq);
err = mlx5_cmd_destroy_eq(dev, eq->eqn);
if (err)
mlx5_core_warn(dev, "failed to destroy a previously created eq: 
eqn %d\n",
   eq->eqn);
-   synchronize_irq(dev->priv.msix_arr[eq->irqn].vector);
+   synchronize_irq(eq->irqn);
mlx5_buf_free(dev, &eq->buf);
 
return err;
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete

2015-11-02 Thread Eric W. Biederman
Sergei Shtylyov  writes:

> Hello.
>>
>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>> index c70d750148b6..893cd2dc3979 100644
>> --- a/net/mpls/af_mpls.c
>> +++ b/net/mpls/af_mpls.c
>> @@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  
>> struct nlmsghdr *nlh,
>>  {
>>  cfg->rc_mp = nla_data(nla);
>>  cfg->rc_mp_len = nla_len(nla);
>> +/* Fail until multipath support is complete */
>> +goto errout;
>>  break;
>
>Forgot to delete *break*?

Nope.  I did that deliberately, because this code is not supposed to
stay this way for long.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next rfc V2 0/2] basic busy polling support for vhost_net

2015-11-02 Thread Jason Wang


On 10/30/2015 07:58 PM, Jason Wang wrote:
>
> On 10/29/2015 04:45 PM, Jason Wang wrote:
>> Hi all:
>>
>> This series tries to add basic busy polling for vhost net. The idea is
>> simple: at the end of tx processing, busy polling for new tx added
>> descriptor and rx receive socket for a while. The maximum number of
>> time (in us) could be spent on busy polling was specified through
>> module parameter.
>>
>> Test were done through:
>>
>> - 50 us as busy loop timeout
>> - Netperf 2.6
>> - Two machines with back to back connected mlx4
>> - Guest with 8 vcpus and 1 queue
>>
>> Result shows very huge improvement on both tx (at most 158%) and rr
>> (at most 53%) while rx is as much as in the past. Most cases the cpu
>> utilization is also improved:
>>
> Just notice there's something wrong in the setup. So the numbers are
> incorrect here. Will re-run and post correct number here.
>
> Sorry.

Here's the updated testing result:

1) 1 vcpu 1 queue:

TCP_RR
size/session/+thu%/+normalize%
1/ 1/0%/  -25%
1/50/  +12%/0%
1/   100/  +12%/   +1%
1/   200/   +9%/   -1%
   64/ 1/   +3%/  -21%
   64/50/   +8%/0%
   64/   100/   +7%/0%
   64/   200/   +9%/0%
  256/ 1/   +1%/  -25%
  256/50/   +7%/   -2%
  256/   100/   +6%/   -2%
  256/   200/   +4%/   -2%
  512/ 1/   +2%/  -19%
  512/50/   +5%/   -2%
  512/   100/   +3%/   -3%
  512/   200/   +6%/   -2%
 1024/ 1/   +2%/  -20%
 1024/50/   +3%/   -3%
 1024/   100/   +5%/   -3%
 1024/   200/   +4%/   -2%
Guest RX
size/session/+thu%/+normalize%
   64/ 1/   -4%/   -5%
   64/ 4/   -3%/  -10%
   64/ 8/   -3%/   -5%
  512/ 1/  +15%/   +1%
  512/ 4/   -5%/   -5%
  512/ 8/   -2%/   -4%
 1024/ 1/   -5%/  -16%
 1024/ 4/   -2%/   -5%
 1024/ 8/   -6%/   -6%
 2048/ 1/  +10%/   +5%
 2048/ 4/   -8%/   -4%
 2048/ 8/   -1%/   -4%
 4096/ 1/   -9%/  -11%
 4096/ 4/   +1%/   -1%
 4096/ 8/   +1%/0%
16384/ 1/  +20%/  +11%
16384/ 4/0%/   -3%
16384/ 8/   +1%/0%
65535/ 1/  +36%/  +13%
65535/ 4/  -10%/   -9%
65535/ 8/   -3%/   -2%
Guest TX
size/session/+thu%/+normalize%
   64/ 1/   -7%/  -16%
   64/ 4/  -14%/  -23%
   64/ 8/   -9%/  -20%
  512/ 1/  -62%/  -56%
  512/ 4/  -62%/  -56%
  512/ 8/  -61%/  -53%
 1024/ 1/  -66%/  -61%
 1024/ 4/  -77%/  -73%
 1024/ 8/  -73%/  -67%
 2048/ 1/  -74%/  -75%
 2048/ 4/  -77%/  -74%
 2048/ 8/  -72%/  -68%
 4096/ 1/  -65%/  -68%
 4096/ 4/  -66%/  -63%
 4096/ 8/  -62%/  -57%
16384/ 1/  -25%/  -28%
16384/ 4/  -28%/  -17%
16384/ 8/  -24%/  -10%
65535/ 1/  -17%/  -14%
65535/ 4/  -22%/   -5%
65535/ 8/  -25%/   -9%

- obvious improvement on TCP_RR (at most 12%)
- improvement on guest RX
- huge decrease on Guest TX (at most -75%), this is probably because
the virtio-net driver suffers from buffer bloat by orphaning skbs before
transmission. The faster vhost is, the smaller the packets it can
produce. To reduce the impact of this, turning off gso in the guest
gives the following result:

size/session/+thu%/+normalize%
   64/ 1/   +3%/  -11%
   64/ 4/   +4%/  -10%
   64/ 8/   +4%/  -10%
  512/ 1/   +2%/   +5%
  512/ 4/0%/   -1%
  512/ 8/0%/0%
 1024/ 1/  +11%/0%
 1024/ 4/0%/   -1%
 1024/ 8/   +3%/   +1%
 2048/ 1/   +4%/   -1%
 2048/ 4/   +8%/   +3%
 2048/ 8/0%/   -1%
 4096/ 1/   +4%/   -1%
 4096/ 4/   +1%/0%
 4096/ 8/   +2%/0%
16384/ 1/   +2%/   -2%
16384/ 4/   +3%/   +1%
16384/ 8/0%/   -1%
65535/ 1/   +9%/   +7%
65535/ 4/0%/   -3%
65535/ 8/   -1%/   -1%

2) 8 vcpus 1 queue:

TCP_RR
size/session/+thu%/+normalize%
1/ 1/   +5%/  -14%
1/50/   +2%/   +1%
1/   100/0%/   -1%
1/   200/0%/0%
   64/ 1/0%/  -25%
   64/50/   +5%/   +5%
   64/   100/0%/0%
   64/   200/0%/   -1%
  256/ 1/0%/  -30%
  256/50/0%/0%
  256/   100/   -2%/   -2%
  256/   200/0%/0%
  512/ 1/   +1%/  -23%
  512/50/   +1%/   +1%
  512/   100/   +1%/0%
  512/   200/   +1%/   +1%
 1024/ 1/   +1%/  -23%
 1024/50/   +5%/   +5%
 1024/   100/0%/   -1%
 1024/   200/0%/0%
Guest RX
size/session/+thu%/+normalize%
   64/ 1/   +1%/   +1%
   64/ 4/   -2%/   +1%
   64/ 8/   +6%/  +19%
  512/ 1/   +5%/   -7%
  512/ 4/   -4%/   -4%
  512/ 8/0%/0%
 1024/ 1/   +1%/   +2%
 1024/ 4/   -2%/   -2%
 1024/ 8/   -1%/   +7%
 2048/ 1/   +8%/   -2%
 2048/ 4/0%/   +5%
 2048/ 8/   -1%/  +13%
 4096/ 1/   -1%/   +2%
 4096/ 4/0%/   +6%
 4096/ 8/   -2%/  +15%
16384/ 1/   -1%/0%
16384/ 4/   -2%/   -1%
16384/ 8/   -2%/   +2%
65535/ 1/   -2%/0%
65535/ 4/   -3%/   -3%
65535/ 8/   -2%/   +2%
Guest TX
size/session/+thu%/+normalize%
   64/ 1/   +6%/   

[PATCH net-next] mpls: Don't accept multipath configuration until the support is complete

2015-11-02 Thread Eric W. Biederman

Currently the multipath code has a nasty failure mode in that it will
fail to notice link down or administrative device down and will
instead black hole packets instead of sending them to their nexthop
destination.

Half the point of multipath is to gracefully handle forwarding path
failures and as the current code does not handle forwarding failures the
current code is dangerous to use.

As mpls multipath has never been exported to userspace and as the
implementation was not complete before the merge window disable the mpls
multipath code by rejecting all multipath configuration requests.  This
will give us another kernel development cycle to cleanly sort out the
issues, without any bad precedents to worry about.

Signed-off-by: "Eric W. Biederman" 
---
 net/mpls/af_mpls.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750148b6..893cd2dc3979 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  
struct nlmsghdr *nlh,
{
cfg->rc_mp = nla_data(nla);
cfg->rc_mp_len = nla_len(nla);
+   /* Fail until multipath support is complete */
+   goto errout;
break;
}
default:
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Resource leak in unshare

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 13:01 -0600, Eric W. Biederman wrote:
> Dmitry Vyukov  writes:
> 
> > Hello,
> >
> > I am hitting the following warnings on
> > bcee19f424a0d8c26ecf2607b73c690802658b29 (4.3):
> 
> Do you have any trace of the earlier failures?
> 
> This appears to be something caused by an earlier failure (possibly
> whatever fails to allocate memory).  Having network devices present
> but being in the generic cleanup routines is wrong.
> 
> If there is no additional information can you please rerun with the
> following change applied?  That should at least report which function is
> failing, and give us a good clue where to start debugging this.

At first, I would say sit is leaking percpu memory

Load sit module, then :

while :
do
ip netns add foo
ip netns del foo
done

Will eat all memory eventually.

ipip6_tunnel_init() and ipip6_fb_tunnel_init() are _both_ called for the
sit0 device, this looks very wrong.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT] Networking

2015-11-02 Thread Andy Lutomirski

On 10/28/2015 02:39 AM, Linus Torvalds wrote:


I'm sorry, but we don't add idiotic new interfaces like this for
idiotic new code like that.


As one of the people who encouraged gcc to add this interface, I'll 
speak up in its favor:


Getting overflow checking right in more complicated cases is a PITA. 
I'll admit that the "subtract from an unsigned integer if it won't go 
negative" isn't particularly useful, but there are other cases in which 
it's much more useful.


The one I care about the most is for multiplication.  Witness the 
never-ending debates about the proper way to implement things like 
kmalloc_array.  We currently do:


static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
if (size != 0 && n > SIZE_MAX / size)
return NULL;
return __kmalloc(n * size, flags);
}

This is correct, and it's even reasonably efficient if size is a 
compile-time constant.  (On x86, it still might not be quite optimal, 
since there'll be an extra cmp instruction.  Sure, the difference could 
easily be a cycle or even less.)


But if size is not a constant, then, unless the compiler is quite 
clever, this ends up generating a division, and that sucks.


If we were willing to do:

size_t total_bytes;
#if efficient_overflow_detection_works
if (__builtin_mul_overflow(n, size, &total_bytes))
return NULL;
#else
/* existing check goes here */
total_bytes = n * size;
#endif
return __kmalloc(n * size, flags);

then we get optimal code generation on new compilers and the result 
isn't even that ugly to look at.


For compiler flag settings in which signed overflow can cause subtle 
disasters, the signed addition overflow helpers can be nice, too.


--Andy
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] commit c6825c0976fa7893692e0e43b09740b419b23c09 upstream.

2015-11-02 Thread Ani Sinha
> On Thu, Oct 29, 2015 at 6:21 PM, Neal P. Murphy
>  wrote:
> > On Thu, 29 Oct 2015 17:01:24 -0700
> > Ani Sinha  wrote:
> >
> >> On Wed, Oct 28, 2015 at 11:40 PM, Neal P. Murphy
> >>  wrote:
> >> > On Wed, 28 Oct 2015 02:36:50 -0400
> >> > "Neal P. Murphy"  wrote:
> >> >
> >> >> On Mon, 26 Oct 2015 21:06:33 +0100
> >> >> Pablo Neira Ayuso  wrote:
> >> >>
> >> >> > Hi,
> >> >> >
> >> >> > On Mon, Oct 26, 2015 at 11:55:39AM -0700, Ani Sinha wrote:
> >> >> > > netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get
> >> >> >
> >> >> > Please, no need to Cc everyone here. Please, submit your Netfilter
> >> >> > patches to netfilter-de...@vger.kernel.org.
> >> >> >
> >> >> > Moreover, it would be great if the subject includes something
> >> >> > descriptive on what you need, for this I'd suggest:
> >> >> >
> >> >> > [PATCH -stable 3.4,backport] netfilter: nf_conntrack: fix RCU race in 
> >> >> > nf_conntrack_find_get
> >> >> >
> >> >> > I'm including Neal P. Murphy, he said he would help testing these
> >> >> > backports, getting a Tested-by: tag usually speeds up things too.
> >> >>
> >> >
> >> > I've probably done about as much seat-of-the-pants testing as I can. All 
> >> > opening/closing the same destination IP/port.
> >> >
> >> > Host: Debian Jessie, 8-core Vishera 8350 at 4.4 GHz, 16GiB RAM at (I 
> >> > think) 2100MHz.
> >> >
> >> > Traffic generator 1: 6-CPU KVM running 64-bit Smoothwall Express 3.1 
> >> > (linux 3.4.109 without these patches), with 8GiB RAM and 9GiB swap. 
> >> > Packets sent across PURPLE (to bypass NAT and firewall).
> >> >
> >> > Traffic generator 2: 32-bit KVM running Smoothwall Express 3.1 (linux 
> >> > 3.4.110 with these patches), 3GiB RAM and minimal swap.
> >> >
> >> > In the first set of tests, generator 1's traffic passed through 
> >> > Generator 2 as a NATting firewall, to the host's web server. In the 
> >> > second set of tests, generator 2's traffic went through NAT to the 
> >> > host's web server.
> >> >
> >> > The load tests:
> >> >   - 2500 processes using 2500 addresses and random src ports
> >> >   - 2500 processes using 2500 addresses and the same src port
> >> >   - 2500 processes using the same src address and port
> >> >
> >> > I also tested using stock NF timeouts and using 1 second timeouts.
> >> >
> >> > Bandwidth used got as high as 16Mb/s for some tests.
> >> >
> >> > Conntracks got up to 200 000 or so or bounced between 1 and 2, depending 
> >> > on the test and the timeouts.
> >> >
> >> > I did not reproduce the problem these patches solve. But more 
> >> > importantly, I saw no problems at all. Each time I terminated a test, 
> >> > RAM usage returned to about that of post-boot; so there were no apparent 
> >> > memory leaks. No kernel messages and no netfilter messages appeared 
> >> > during the tests.
> >> >
> >> > If I have time, I suppose I could run another set of tests: 2500 source 
> >> > processes using 2500 addresses times 200 ports to connect to 2500 
> >> > addresses times 200 ports on a destination system. Each process opens 
> >> > 200 sockets, then closes them. And repeats ad infinitum. But I might 
> >> > have to be clever since I can't run 500 000 processes; but I could run 
> >> > 20 VMs; that would get it down to about 12 000 processes per VM. And I 
> >> > might have to figure out how to allow processes on the destination
> >> > system to open hundreds or thousands of sockets.
> >>
> >> Should I resend the patch with a Tested-by: tag?
> >
> > ... Oh, wait. Not yet. The dawn just broke over ol' Marblehead here. I only 
> > tested TCP; I need to hammer UDP, too.
> >
> > Can I set the timeouts to zero? Or is one as low as I can go?
>
> Any progress with testing ?

I applied the 'hammer' through a firewall with the patch. I used TCP,
UDP and ICMP.

I don't know if the patch fixes the problem. But I'm reasonably sure
that it did not break normal operations.

To test a different problem I fixed (a memory leak in my 64-bit
counter patch for xt_ACCOUNT), I tested 60,000 addresses (most of a
/16) through the firewall. Again, no troubles.

I only observed two odd things which are likely completely unrelated
to your patch. When I started the TCP test, then added the UDP test,
only TCP would come through. If I stopped and restarted the TCP test,
only UDP would come through. I suspect this is due to buffering. It's
just a behaviour I haven't encountered since I started using Linux
many years ago (around '98). The second, when I started the test, the
firewall would lose contact with the upstream F/W's apcupsd daemon;
again, this is likely due to the nature of the test: it likely floods
input and output queues.


I'd say you can probably resend with Tested-by.

Neal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

[PATCH -stable 3.4,backport] commit c6825c0976fa7893692e0e43b09740b419b23c09 upstream.

2015-11-02 Thread Ani Sinha
netfilter: nf_conntrack: don't release a conntrack with non-zero
refcnt

With this patch, the conntrack refcount is initially set to zero and
it is bumped once it is added to any of the list, so we fulfill
Eric's golden rule which is that all released objects always have a
refcount that equals zero.

Andrey Vagin reports that nf_conntrack_free can't be called for a
conntrack with non-zero ref-counter, because it can race with
nf_conntrack_find_get().

A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
ref-counter says that this conntrack is used. So when we release
a conntrack with non-zero counter, we break this assumption.

CPU1                                    CPU2
nf_conntrack_find()
                                        nf_ct_put()
                                         destroy_conntrack()
                                        ...
                                        init_conntrack
                                         __nf_conntrack_alloc (set use = 1)
atomic_inc_not_zero(&ct->use) (use = 2)
                                         if (!l4proto->new(ct, skb, dataoff, timeouts))
                                          nf_conntrack_free(ct); (use = 2 !!!)
                                        ...
                                        __nf_conntrack_alloc (set use = 1)
 if (!nf_ct_key_equal(h, tuple, zone))
  nf_ct_put(ct); (use = 0)
   destroy_conntrack()
                                        /* continue to work with CT */

After applying the patch "[PATCH] netfilter: nf_conntrack: fix RCU
race in nf_conntrack_find_get" another bug was triggered in
destroy_conntrack():

<4>[67096.759334] [ cut here ]
<2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
...
<4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C 
---2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
<4>[67096.759932] RIP: 0010:[]  [] 
destroy_conntrack+0x15c/0x190 [nf_conntrack]
<4>[67096.760255] Call Trace:
<4>[67096.760255]  [] nf_conntrack_destroy+0x17/0x30
<4>[67096.760255]  [] nf_conntrack_find_get+0x85/0x130 
[nf_conntrack]
<4>[67096.760255]  [] nf_conntrack_in+0x352/0xb60 
[nf_conntrack]
<4>[67096.760255]  [] ipv4_conntrack_local+0x51/0x60 
[nf_conntrack_ipv4]
<4>[67096.760255]  [] nf_iterate+0x69/0xb0
<4>[67096.760255]  [] ? dst_output+0x0/0x20
<4>[67096.760255]  [] nf_hook_slow+0x74/0x110
<4>[67096.760255]  [] ? dst_output+0x0/0x20
<4>[67096.760255]  [] raw_sendmsg+0x775/0x910
<4>[67096.760255]  [] ? flush_tlb_others_ipi+0x128/0x130
<4>[67096.760255]  [] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255]  [] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255]  [] inet_sendmsg+0x4a/0xb0
<4>[67096.760255]  [] ? sock_sendmsg+0x13/0x140
<4>[67096.760255]  [] sock_sendmsg+0x117/0x140
<4>[67096.760255]  [] ? native_smp_send_reschedule+0x49/0x60
<4>[67096.760255]  [] ? _spin_unlock_bh+0x1b/0x20
<4>[67096.760255]  [] ? autoremove_wake_function+0x0/0x40
<4>[67096.760255]  [] ? do_ip_setsockopt+0x90/0xd80
<4>[67096.760255]  [] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255]  [] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255]  [] sys_sendto+0x139/0x190
<4>[67096.760255]  [] ? audit_syscall_entry+0x1d7/0x200
<4>[67096.760255]  [] ? __audit_syscall_exit+0x265/0x290
<4>[67096.760255]  [] compat_sys_socketcall+0x13f/0x210
<4>[67096.760255]  [] ia32_sysret+0x0/0x5

I have reused the original title for the RFC patch that Andrey posted and
most of the original patch description.

Signed-off-by: Ani Sinha 
Tested-by: "Neal P. Murphy"  
---
 net/netfilter/nf_conntrack_core.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 9a171b2..9a46908 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -441,7 +441,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
goto out;
 
add_timer(&ct->timeout);
-   nf_conntrack_get(&ct->ct_general);
+   smp_wmb();
+   /* The caller holds a reference to this object */
+   atomic_set(&ct->ct_general.use, 2);
__nf_conntrack_hash_insert(ct, hash, repl_hash);
NF_CT_STAT_INC(net, insert);
spin_unlock_bh(&nf_conntrack_lock);
@@ -732,11 +734,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
nf_ct_zone->id = zone;
}
 #endif
-   /*
-* changes to lookup keys must be done before setting refcnt to 1
+   /* Because we use RCU lookups, we set ct_general.use to zero before
+* this is inserted in any list.
 */
-   smp_wmb();
-   atomic_set(&ct->ct_general.use, 1);
+   atomic_set(&ct->ct_general.use, 0);
return ct;
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -759,6 +760,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 void nf_conntrack_free(struct nf_conn *ct)
 {
struct net *net = nf_ct_net(ct);
+   /* 

Re: [net-next v4 2/8] dpaa_eth: add support for DPAA Ethernet

2015-11-02 Thread Joe Perches
On Mon, 2015-11-02 at 19:31 +0200, Madalin Bucur wrote:
> This introduces the Freescale Data Path Acceleration Architecture
> (DPAA) Ethernet driver (dpaa_eth) that builds upon the DPAA QMan,
> BMan, PAMU and FMan drivers to deliver Ethernet connectivity on
> the Freescale DPAA QorIQ platforms.
[]
> diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
> b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
[]
> +static void _dpa_rx_error(struct net_device *net_dev,
> +   const struct dpa_priv_s *priv,
> +   struct dpa_percpu_priv_s *percpu_priv,
> +   const struct qm_fd *fd,
> +   u32 fqid)
> +{
> + /* limit common, possibly innocuous Rx FIFO Overflow errors'
> +  * interference with zero-loss convergence benchmark results.
> +  */
> + if (likely(fd->status & FM_FD_ERR_PHYSICAL))
> + pr_warn_once("non-zero error counters in fman statistics 
> (sysfs)\n");
> + else
> + if (net_ratelimit())
> + netif_err(priv, hw, net_dev, "Err FD status = 0x%08x\n",
> +   fd->status & FM_FD_STAT_RX_ERRORS);

It's a bit of a pity the logging message code is
a mix of pr_<level>, dev_<level>, netdev_<level>
and netif_<level>

Perhaps netif_<level>_ratelimited macros should be added.

Something like:

---
 include/linux/netdevice.h | 54 +++
 1 file changed, 54 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 210d11a..555471d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4025,6 +4025,60 @@ do { 
\
 })
 #endif
 
+#define netif_level_ratelimited(level, priv, type, dev, fmt, args...)  \
+do {   \
+   if (netif_msg_##type(priv) && net_ratelimit())  \
+   netdev_##level(dev, fmt, ##args);   \
+} while (0)
+
+#define netif_emerg_ratelimited(priv, type, dev, fmt, args...) \
+   netif_level_ratelimited(emerg, priv, type, dev, fmt, ##args)
+#define netif_alert_ratelimited(priv, type, dev, fmt, args...) \
+   netif_level_ratelimited(alert, priv, type, dev, fmt, ##args)
+#define netif_crit_ratelimited(priv, type, dev, fmt, args...)  \
+   netif_level_ratelimited(crit, priv, type, dev, fmt, ##args)
+#define netif_err_ratelimited(priv, type, dev, fmt, args...)   \
+   netif_level_ratelimited(err, priv, type, dev, fmt, ##args)
+#define netif_warn_ratelimited(priv, type, dev, fmt, args...)  \
+   netif_level_ratelimited(warn, priv, type, dev, fmt, ##args)
+#define netif_notice_ratelimited(priv, type, dev, fmt, args...)
\
+   netif_level_ratelimited(notice, priv, type, dev, fmt, ##args)
+#define netif_info_ratelimited(priv, type, dev, fmt, args...)  \
+   netif_level_ratelimited(info, priv, type, dev, fmt, ##args)
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+/* descriptor check is first to prevent flooding with "callbacks suppressed" */
+#define netif_dbg_ratelimited(priv, type, dev, fmt, args...)   \
+do {   \
+   DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
+   if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&\
+   netif_msg_##type(priv) && net_ratelimit())  \
+   __dynamic_netdev_dbg(&descriptor, dev, fmt, ##args);\
+} while (0)
+#elif defined(DEBUG)
+#define netif_dbg_ratelimited(priv, type, dev, fmt, args...)   \
+do {   \
+   if (netif_msg_##type(priv) && net_ratelimit())  \
+   netif_printk(priv, type, KERN_DEBUG, dev, fmt, ##args); \
+} while (0)
+#else
+#define netif_dbg_ratelimited(priv, type, dev, fmt, args...)   \
+do {   \
+   if (0)  \
+   netif_printk(priv, type, KERN_DEBUG, dev, fmt, ##args); \
+} while (0)
+#endif
+
+#if defined(VERBOSE_DEBUG)
+#define netif_vdbg_ratelimited netif_dbg_ratelimited
+#else
+#define netif_vdbg(priv, type, dev, fmt, args...)  \
+do {   \
+   if (0)  \
+   netif_printk(priv, type, KERN_DEBUG, dev, fmt, ##args); \
+} while (0)
+#endif
+
 /*
  * The list of packet types we will receive (as opposed to discard)
  * and the routines to invoke.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next v4 1/8] devres: add devm_alloc_percpu()

2015-11-02 Thread Joe Perches
On Mon, 2015-11-02 at 19:31 +0200, Madalin Bucur wrote:
> Introduce managed counterparts for alloc_percpu() and free_percpu().
> Add devm_alloc_percpu() and devm_free_percpu() into the managed
> interfaces list.

trivia, could be fixed later

> +/**
> + * __devm_alloc_percpu - Resource-managed alloc_percpu
> + * @dev: Device to allocate per-cpu memory for
> + * @size: Size of per-cpu memory to allocate
> + * @align: Alignement of per-cpu memory to allocate

French spelling?  alignment


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-02 Thread Oliver Hartkopp
On 11/02/2015 08:41 PM, Aleksander Morgado wrote:
> On Mon, Nov 2, 2015 at 12:14 PM, Oliver Hartkopp  
> wrote:
>>
>> What about defining some overlay data structure to map ARINC-429 frames into
>> CAN frames?
>>
>> E.g. we could write the ARINC 32 bit data completely into data[0..3] and
>> additionally copy the 8 bit label information (or should it better be 10 bit
>> including the Source/Destination Identifiers?) additionally into the can_id.
> 
> Note that the only bits which are always treated as non-data are the 8
> label bits (well, and the parity bit #31). The 2 SDI bits (#8, #9) may
> be used as data bits when a high resolution is needed, like Lat/Long
> encoded in binary words 310 and 311. I wouldn't make any assumption on
> what's on those 2 bits; i.e. they're not always "source/destination".
> 

You definitely know these details better than me. That's why I'm asking.

Would hosting the 32 bit in the struct can_frame.data and just the 8 bit label
in struct can_frame.can_id offer the functionality you need?

Besides the arinc429_frame struct

struct arinc429_frame {
	__u8 label;	/* 8 bit label */
	__u8 data[3];	/* Up-to 23 bits are valid. */
};

everything else roughly looks like copy from PF_CAN with renaming.

So when we can fit the arinc frames into CAN frames and re-use the existing
CAN infrastructure - we are almost done.

Regards,
Oliver

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost: move is_le setup to the backend

2015-11-02 Thread David Miller
From: Greg Kurz 
Date: Fri, 30 Oct 2015 12:42:35 +0100

> The vq->is_le field is used to fix endianness when accessing the vring via
> the cpu_to_vhost16() and vhost16_to_cpu() helpers in the following cases:
> 
> 1) host is big endian and device is modern virtio
> 
> 2) host has cross-endian support and device is legacy virtio with a different
>endianness than the host
> 
> Both cases rely on the VHOST_SET_FEATURES ioctl, but 2) also needs the
> VHOST_SET_VRING_ENDIAN ioctl to be called by userspace. Since vq->is_le
> is only needed when the backend is active, it was decided to set it at
> backend start.
> 
> This is currently done in vhost_init_used()->vhost_init_is_le() but it
> obfuscates the core vhost code. This patch moves the is_le setup to a
> dedicated function that is called from the backend code.
> 
> Note vhost_net is the only backend that can pass vq->private_data == NULL to
> vhost_init_used(), hence the "if (sock)" branch.
> 
> No behaviour change.
> 
> Signed-off-by: Greg Kurz 

Michael, I'm assuming that you will be the one taking this.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] enic: assign affinity hint to interrupts

2015-11-02 Thread David Miller
From: Govindarajulu Varadarajan <_gov...@gmx.com>
Date: Fri, 30 Oct 2015 16:52:51 +0530

> The affinity hint is used by the user space daemon, irqbalancer, to
> indicate a preferred CPU mask for irqs. This patch sets the irq affinity
> hint to local numa core first, when exhausted we try non-local numa cores.
> 
> Also set tx xps cpus mask based on affinity hint.
> 
> v2: remove the global affinity policy.
> 
> Signed-off-by: Govindarajulu Varadarajan <_gov...@gmx.com>

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-02 Thread Vostrikov Andrey
Hi,

> I was thinking about this and I mostly agree with you. Obviously, copying the
> code this way was dumb. On the other hand, ARINC and CAN are two different
> sort of busses, so I'd propose something slightly different here to avoid
> confusion and prevent the future extensions (or protocols) from adding
> unrelated cruft into the CAN stack.

Another  major  difference  between  CAN and ARINC429 is that ARINC is
simplex.  It  does  not  need  loopback  and echo. For example HOLT IC
chip  HI-3593  has  two receivers and single transmitter, which
should  be  instantiated as separate devices, as each channel could be
connected to different network.

It  would  be nice if new ARINC framework will provide means to create
RX  or  TX  only  network device and have -rx- or -tx- as part of device
name.

Label  space in ARINC is much smaller than in CAN, is it really needed
to  have  hash  and  masks? May be simple bitmap for 256 bits will fit
better.  At least it could be directly provided to mentioned HOLT chip
to do filtering in hardware.

-- 
Best regards,
Andrey

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-02 Thread Marek Vasut
On Monday, November 02, 2015 at 09:15:21 PM, Vostrikov Andrey wrote:
> Hi,

Hi,

> > I was thinking about this and I mostly agree with you. Obviously, copying
> > the code this way was dumb. On the other hand, ARINC and CAN are two
> > different sort of busses, so I'd propose something slightly different
> > here to avoid confusion and prevent the future extensions (or protocols)
> > from adding unrelated cruft into the CAN stack.
> 
> Another  major  difference  between  CAN and ARINC429 is that ARINC is
> simplex.  It  does  not  need  loopback  and echo. For example HOLT IC
> chip  HI-3593  has  two receivers and single transmitter, which
> should  be  instantiated as separate devices, as each channel could be
> connected to different network.

So this would effectively be three devices, correct ?  I think you can just
register a regular ARINC device for each channel and be done with it. Loopback
and echo can be configurable.

> It  would  be nice if new ARINC framework will provide means to create
> RX  or  TX  only  network device and have -rx- or -tx- as part of device
> name.

I'd say you can fail the TX if you're trying to send via an RX-only channel.
The naming can probably be also tweaked, but I don't see much value in that,
especially since you can rename those interfaces by using udev rules. Checking
if the interface supports RX/TX should be done by other means, not the name.

> Label  space in ARINC is much smaller than in CAN, is it really needed
> to  have  hash  and  masks? May be simple bitmap for 256 bits will fit
> better.  At least it could be directly provided to mentioned HOLT chip
> to do filtering in hardware.

CAN does the can_id filtering this way and I find it familiar and convenient,
so I don't see a reason not to re-use it. If the hardware has some special
support for the frame filtering, it's the driver that should convert the
filter specification into form which the hardware understands -- this sort
of configuration is done only once at the beginning of operation, so some
small overhead of the conversion of the filter setting should be acceptable,
we're talking about generating 256 entries for the hardware from ID/mask tuple,
no big deal here.

Best regards,
Marek Vasut
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] net: make skb_set_owner_w() more robust

2015-11-02 Thread David Miller
From: Eric Dumazet 
Date: Mon, 02 Nov 2015 12:09:25 -0800

> On Mon, 2015-11-02 at 20:05 +, Haiyang Zhang wrote:
> 
>> Thanks for the fix!
>> For some driver, like ours, this condition may not be "unlikely".
>> So could you remove the "unlikely"?
> 
> No, I wont remove the unlikely.
> 
> Look, your main issue is about reallocating skbs, because of excessive
> dev->needed_headroom.
> 
> An unlikely() mismatch is 1000 times less expensive, why would you
> care ?
> 
> If you really care, fix your driver to not abuse skb->head to store 220
> bytes of private data.

+1

And I've been saying this from the beginning.  This driver must place
its private per-packet data in another location if it wants optimal
behavior inside of the Linux networking stack.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next PATCH v2] RDS: convert bind hash table to re-sizable hashtable

2015-11-02 Thread David Miller
From: Santosh Shilimkar 
Date: Fri, 30 Oct 2015 08:49:10 -0700

> To further improve the RDS connection scalability on massive systems
> where number of sockets grows into tens of thousands  of sockets, there
> is a need of larger bind hashtable. Pre-allocated 8K or 16K table is
> not very flexible in terms of memory utilisation. The rhashtable
> infrastructure gives us the flexibility to grow the hashtable based
> on use and also comes up with inbuilt efficient bucket(chain) handling.
> 
> Reviewed-by: David Miller 
> Signed-off-by: Santosh Shilimkar 
> Signed-off-by: Santosh Shilimkar 
> ---
> v2: Dropped empty new line from rds_add_bound()  (David Miller)

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -stable 3.4,backport] commit c6825c0976fa7893692e0e43b09740b419b23c09 upstream.

2015-11-02 Thread Ani Sinha
netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get

Lets look at destroy_conntrack:

hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
...
nf_conntrack_free(ct)
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);

net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU.

The hash is protected by rcu, so readers look up conntracks without
locks.
A conntrack is removed from the hash, but in this moment a few readers
still can use the conntrack. Then this conntrack is released and another
thread creates conntrack with the same address and the equal tuple.
After this a reader starts to validate the conntrack:
* It's not dying, because a new conntrack was created
* nf_ct_tuple_equal() returns true.

But this conntrack is not initialized yet, so it can not be used by two
threads concurrently. In this case BUG_ON may be triggered from
nf_nat_setup_info().

Florian Westphal suggested to check the confirm bit too. I think it's
right.

task 1                  task 2                  task 3
                        nf_conntrack_find_get
                         nf_conntrack_find
destroy_conntrack
 hlist_nulls_del_rcu
 nf_conntrack_free
 kmem_cache_free
                                                __nf_conntrack_alloc
                                                 kmem_cache_alloc
                                                 memset(&ct->tuplehash[IP_CT_DIR_MAX],
                         if (nf_ct_is_dying(ct))
                         if (!nf_ct_tuple_equal()

I'm not sure, that I have ever seen this race condition in a real life.
Currently we are investigating a bug, which is reproduced on a few nodes.
In our case one conntrack is initialized from a few tasks concurrently,
we don't have any other explanation for this.

<2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322!
...
<4>[46267.083951] RIP: 0010:[]  [] 
nf_nat_setup_info+0x564/0x590 [nf_nat]
...
<4>[46267.085549] Call Trace:
<4>[46267.085622]  [] alloc_null_binding+0x5b/0xa0 
[iptable_nat]
<4>[46267.085697]  [] nf_nat_rule_find+0x5c/0x80 [iptable_nat]
<4>[46267.085770]  [] nf_nat_fn+0x111/0x260 [iptable_nat]
<4>[46267.085843]  [] nf_nat_out+0x48/0xd0 [iptable_nat]
<4>[46267.085919]  [] nf_iterate+0x69/0xb0
<4>[46267.085991]  [] ? ip_finish_output+0x0/0x2f0
<4>[46267.086063]  [] nf_hook_slow+0x74/0x110
<4>[46267.086133]  [] ? ip_finish_output+0x0/0x2f0
<4>[46267.086207]  [] ? dst_output+0x0/0x20
<4>[46267.086277]  [] ip_output+0xa4/0xc0
<4>[46267.086346]  [] raw_sendmsg+0x8b4/0x910
<4>[46267.086419]  [] inet_sendmsg+0x4a/0xb0
<4>[46267.086491]  [] ? sock_update_classid+0x3a/0x50
<4>[46267.086562]  [] sock_sendmsg+0x117/0x140
<4>[46267.086638]  [] ? _spin_unlock_bh+0x1b/0x20
<4>[46267.086712]  [] ? autoremove_wake_function+0x0/0x40
<4>[46267.086785]  [] ? do_ip_setsockopt+0x90/0xd80
<4>[46267.086858]  [] ? call_function_interrupt+0xe/0x20
<4>[46267.086936]  [] ? ub_slab_ptr+0x20/0x90
<4>[46267.087006]  [] ? ub_slab_ptr+0x20/0x90
<4>[46267.087081]  [] ? kmem_cache_alloc+0xd8/0x1e0
<4>[46267.087151]  [] sys_sendto+0x139/0x190
<4>[46267.087229]  [] ? sock_setsockopt+0x16d/0x6f0
<4>[46267.087303]  [] ? audit_syscall_entry+0x1d7/0x200
<4>[46267.087378]  [] ? __audit_syscall_exit+0x265/0x290
<4>[46267.087454]  [] ? compat_sys_setsockopt+0x75/0x210
<4>[46267.087531]  [] compat_sys_socketcall+0x13f/0x210
<4>[46267.087607]  [] ia32_sysret+0x0/0x5
<4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 
c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 
0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74
<1>[46267.088023] RIP  [] nf_nat_setup_info+0x564/0x590

Signed-off-by: Ani Sinha 
Tested-by: Neal P. Murphy 
---
 net/netfilter/nf_conntrack_core.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 9a46908..fd0f7a3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -309,6 +309,21 @@ static void death_by_timeout(unsigned long ul_conntrack)
nf_ct_put(ct);
 }
 
+static inline bool
+nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
+   const struct nf_conntrack_tuple *tuple,
+   u16 zone)
+{
+   struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+   /* A conntrack can be recreated with the equal tuple,
+* so we need to check that the conntrack is confirmed
+*/
+   return nf_ct_tuple_equal(tuple, &h->tuple) &&
+   nf_ct_zone(ct) == zone &&
+   nf_ct_is_confirmed(ct);
+}
+
 /*
  * Warning :
  * - Caller must take a reference on returned object
@@ -330,8 +345,7 @@ nf_conntrack_find(struct net *net, u16 zone,
local_bh_disable();
 begin:
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
-   if 

Re: Resource leak in unshare

2015-11-02 Thread Eric W. Biederman
Dmitry Vyukov  writes:

> Hello,
>
> I am hitting the following warnings on
> bcee19f424a0d8c26ecf2607b73c690802658b29 (4.3):

Do you have any trace of the earlier failures?

This appears to be something caused by an earlier failure (possibly
whatever fails to allocate memory).  Having network devices present
but being in the generic cleanup routines is wrong.

If there is no additional information can you please rerun with the
following change applied?  That should at least report which function is
failing, and give us a good clue where to start debugging this.

Eric


diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..125c94af22b8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -292,6 +292,7 @@ out:
return error;
 
 out_undo:
+   WARN(1, "net ops->init %pF returned with %d\n", ops->init, error);
/* Walk through the list backwards calling the exit functions
 * for the pernet modules whose init functions did not fail.
 */

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-02 Thread Marek Vasut
On Monday, November 02, 2015 at 12:14:27 PM, Oliver Hartkopp wrote:
> On 02.11.2015 10:47, Marc Kleine-Budde wrote:
> > On 11/02/2015 12:16 AM, Marek Vasut wrote:
> >> The ARINC-429 is a technical standard, which describes, among others,
> >> a data bus used by airplanes. The standard contains much more, since
> >> it is based off the ISO/OSI model, but this patch implements just the
> >> data bus protocol.
> >> 
> >> This stack is derived from the SocketCAN implementation, already present
> >> in the kernel and thus behaves in a very similar fashion. Thus far, we
> >> support sending RAW ARINC-429 datagrams, configuration of the RX and TX
> >> clock speed and filtering.
> >> 
> >> The ARINC-429 datagram is four-byte long. The first byte is always the
> >> LABEL, the function of remaining three bytes can vary, so we handle it
> >> as an opaque PAYLOAD. The userspace tools can send these datagrams via
> >> a standard socket.
> >> 
> >> A LABEL-based filtering can be configured on each socket separately in
> >> a way comparable to CAN -- user uses setsockopt() to push a list of
> >> label,mask tuples into the kernel and the kernel will deliver a datagram
> >> to the socket if (<received label> & mask) == (label & mask), otherwise
> >> the datagram is not delivered.
> > 
> > What's difference compared to CAN besides a different MTU? The CAN stack
> > is already capable to handle CAN and CAN-FD frames. Would it make sense
> > to integrate the ARINC-429 into the existing CAN stack?
> 
> That was my first impression too.

Hi!

> What about defining some overlay data structure to map ARINC-429 frames
> into CAN frames?

I agree about the code reuse, it was stupid to do such a blatant copy of the
code all right. I don't think it's such a great idea to outright place ARINC 
support into the CAN stack though. They're two different busses after all. 
Please see below.

> E.g. we could write the ARINC 32 bit data completely into data[0..3] and
> additionally copy the 8 bit label information (or should it better be 10
> bit including the Source/Destination Identifiers?) additionally into the
> can_id.
> 
>  From what I can see the filtering by label is similar to filtering by
> can_id. And you would be able to use the can-gw functionality too.

This is correct.

> The only real difference is the bitrate configuration of the ARINC
> interface.

There might be additional ARINC-specific configuration bits involved,
but thus far, that's correct.

> I wonder if a similar approach would fit here as we discussed with the
> University of Prague for a LIN implementation using the PF_CAN
> infrastructure:

OT: Hey, there is no "University of Prague", there are two universities in 
Prague to boot -- Charles University and Czech Technical University -- you
mean the latter ;-)

> http://rtime.felk.cvut.cz/can/lin-bus/
> 
> It could probably boil down to a 'CAN interface' that is named arinc0 which
> implements the serial driver like in slcan.c or sllin.c ...

I was thinking about this and I mostly agree with you. Obviously, copying the
code this way was dumb. On the other hand, ARINC and CAN are two different sort 
of busses, so I'd propose something slightly different here to avoid confusion 
and prevent the future extensions (or protocols) from adding unrelated cruft 
into the CAN stack.

I would propose we (read: me) create some sort of "common" core, which would
contain the following:
 - drivers/net/: big part of the device interface here is common
 big part of the virtual interface here is common
 -> CAN or ARINC can just add their own specific callbacks and
be done with it
   
 - net/: there's a lot of common parts as well, like the filtering can be
 unified such that it can be used by both. A big part of the socket
 handling is also similar.

This would also let the slcan or sllin or whatever stuff they made at CVUT just
plug into this "common" core part.

Now I wonder if we should introduce AF_ARINC or stick to AF_CAN for both. I'd
be much happier to keep those two separate, again, to avoid confusion.

What do you think please ?

Best regards,
Marek Vasut
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] net: make skb_set_owner_w() more robust

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 20:05 +, Haiyang Zhang wrote:

> Thanks for the fix!
> For some driver, like ours, this condition may not be "unlikely".
> So could you remove the "unlikely"?

No, I wont remove the unlikely.

Look, your main issue is about reallocating skbs, because of excessive
dev->needed_headroom.

An unlikely() mismatch is 1000 times less expensive, why would you
care ?

If you really care, fix your driver to not abuse skb->head to store 220
bytes of private data.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 0/5] dp83640 driver fixes

2015-11-02 Thread David Miller
From: Stefan Sørensen 
Date: Fri, 30 Oct 2015 13:13:59 +0100

> This series fixes a number of minor bugs in the dp83640 driver. 

Looks like Richard wants changes to patch #1.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: pull-request: can 2015-10-30

2015-11-02 Thread David Miller
From: Marc Kleine-Budde 
Date: Fri, 30 Oct 2015 14:39:58 +0100

> this is a pull request for the upcoming v4.3 release.
> 
> Marek Vasut provides a patch to use the correct attrlen in the nla_put() in
> the can_fill_info() function.

Pulled, but this missed the v4.3 release so you'll need to push this explicitly
out to -stable.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] net: encx24j600: Fix SPI driver module autoload

2015-11-02 Thread David Miller
From: Javier Martinez Canillas 
Date: Fri, 30 Oct 2015 13:49:16 +0100

> Recently I've been trying to fix module autoloading for all SPI drivers and
> found that the encx24j600 driver does not fill module alias information due to
> missing a MODULE_DEVICE_TABLE() so module autoload won't work and the driver
> Kconfig symbol is tristate which means the driver can be built as a module.
> 
> But also the SPI id table is not correctly defined so this series fixes both
> issues.

Series applied to net-next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] net: rds: changing the return type from int to void

2015-11-02 Thread David Miller
From: Saurabh Sengar 
Date: Fri, 30 Oct 2015 19:46:44 +0530

> as result of function rds_iw_flush_mr_pool is nowhere checked,
> changing its return type from int to void.
> also removing the unused variable rc as there is nothing to return
> 
> Signed-off-by: Saurabh Sengar 
> ---
> v2 :  modify patch description, as per the comments from Sergei Shtylyov

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: convert hashtab lock to raw lock

2015-11-02 Thread David Miller
From: Yang Shi 
Date: Fri, 30 Oct 2015 15:16:26 -0700

> When running bpf samples on rt kernel, it reports the below warning:
 ...
> Convert hashtab lock to raw lock to avoid such warning.
> 
> Signed-off-by: Yang Shi 

Applied to net-next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next v4 3/8] dpaa_eth: add support for S/G frames

2015-11-02 Thread Joe Perches
On Mon, 2015-11-02 at 19:31 +0200, Madalin Bucur wrote:
> Add support for Scater/Gather (S/G) frames. The FMan can place
> the frame content into multiple buffers and provide a S/G Table
> (SGT) into one first buffer with references to the others.

trivia: scatter

> diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
> b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
[]
> @@ -1177,10 +1177,42 @@ void dpaa_eth_init_ports(struct mac_device *mac_dev,
> port_fqs->rx_defq, &buf_layout[RX]);
>  }
>  
> +void dpa_release_sgt(struct qm_sg_entry *sgt)
> +{
> + struct dpa_bp *dpa_bp;
> + struct bm_buffer bmb[DPA_BUFF_RELEASE_MAX];

Where is "struct bm_buffer" defined?


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2] ipv4: use l4 hash for locally generated multipath flows

2015-11-02 Thread David Miller
From: Paolo Abeni 
Date: Thu, 29 Oct 2015 22:20:40 +0100

> This patch changes how the multipath hash is computed for locally
> generated flows: now the hash comprises l4 information.
> 
> This allows better utilization of the available paths when the existing
> flows have the same source IP and the same destination IP: with l3 hash,
> even when multiple connections are in place simultaneously, a single path
> will be used, while with l4 hash we can use all the available paths.
> 
> v2 changes:
> - use get_hash_from_flowi4() instead of implementing just another l4 hash
>   function
> 
> Signed-off-by: Paolo Abeni 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-02 Thread Sowmini Varadhan
On (11/02/15 17:26), Nelson, Shannon wrote:
> > I assume you mean .1q 
> 
> Yes, this is what I had in mind.

I dont think we're quite there yet, even without vlans.

If I turn on/off tcpdump, there's something about the way that the link
is bounced that leaves the device down while tcpdump is running. Then
after I exit tcpdump, it bounces things a few more times again, packets
flow for a brief interval, and then there's silence.

Seems like there's a workq that results in
i40e_service_task->i40e_sync_vsi_filters that periodically resets things.

Doing 'ip link set eth0 promisc on' keeps things nice and steady.

How is this all supposed to work if I change the macaddr from /sbin/ip 
using i40e_set_mac() and then jiggle the promisc (either just the flag,
or with tcpdump)? (I cant tell because I dont have an x86 machine with 
i40e handy)

To frame the question differently, where all should I be invoking
the new i40e_macaddr_init() function from? 

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v2 net-next] net: make skb_set_owner_w() more robust

2015-11-02 Thread Haiyang Zhang


> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Sunday, November 1, 2015 6:37 PM
> To: David Miller 
> Cc: Haiyang Zhang ; eduma...@google.com;
> netdev@vger.kernel.org; KY Srinivasan 
> Subject: [PATCH v2 net-next] net: make skb_set_owner_w() more robust
> 
> From: Eric Dumazet 
> 
> skb_set_owner_w() is called from various places that assume
> skb->sk always point to a full blown socket (as it changes
> sk->sk_wmem_alloc)
> 
> We'd like to attach skb to request sockets, and in the future
> to timewait sockets as well. For these kind of pseudo sockets,
> we need to take a traditional refcount and use sock_edemux()
> as the destructor.
> 
> It is now time to un-inline skb_set_owner_w(), being too big.
> 
> Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets
> instead of listener")
> Signed-off-by: Eric Dumazet 
> Bisected-by: Haiyang Zhang 
> ---
> v2: sock_edemux() must be guarded by CONFIG_INET
> 
>  include/net/sock.h|   17 ++---
>  net/core/sock.c   |   22 ++
>  net/ipv4/tcp_output.c |4 +---
>  3 files changed, 25 insertions(+), 18 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index aeed5c95f3ca..f570e75e3da9 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1951,6 +1951,8 @@ static inline void skb_set_hash_from_sk(struct
> sk_buff *skb, struct sock *sk)
>   }
>  }
> 
> +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);
> +
>  /*
>   *   Queue a received datagram if it will fit. Stream and sequenced
>   *   protocols can't normally use this as they need to fit buffers in
> @@ -1959,21 +1961,6 @@ static inline void skb_set_hash_from_sk(struct
> sk_buff *skb, struct sock *sk)
>   *   Inlined as it's very short and called for pretty much every
>   *   packet ever received.
>   */
> -
> -static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
> -{
> - skb_orphan(skb);
> - skb->sk = sk;
> - skb->destructor = sock_wfree;
> - skb_set_hash_from_sk(skb, sk);
> - /*
> -  * We used to take a refcount on sk, but following operation
> -  * is enough to guarantee sk_free() wont free this sock until
> -  * all in-flight packets are completed
> -  */
> - atomic_add(skb->truesize, &sk->sk_wmem_alloc);
> -}
> -
>  static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
>  {
>   skb_orphan(skb);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 0ef30aa90132..7529eb9463be 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1656,6 +1656,28 @@ void sock_wfree(struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL(sock_wfree);
> 
> +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
> +{
> + skb_orphan(skb);
> + skb->sk = sk;
> +#ifdef CONFIG_INET
> + if (unlikely(!sk_fullsock(sk))) {

Thanks for the fix!
For some driver, like ours, this condition may not be "unlikely".
So could you remove the "unlikely"?

Thanks,
- Haiyang




Re: [PATCH net-next] tcp/dccp: fix ireq->pktopts race

2015-11-02 Thread David Miller
From: Eric Dumazet 
Date: Fri, 30 Oct 2015 09:46:12 -0700

> From: Eric Dumazet 
> 
> IPv6 request sockets store a pointer to skb containing the SYN packet
> to be able to transfer it to full blown socket when 3WHS is done
> (ireq->pktopts -> np->pktoptions)
> 
> As explained in commit 5e0724d027f0 ("tcp/dccp: fix hashdance race for
> passive sessions"), we must transfer the skb only if we won the
> hashdance race, if multiple cpus receive the 'ack' packet completing
> 3WHS at the same time.
> 
> Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
> Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
> Signed-off-by: Eric Dumazet 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-02 Thread Aleksander Morgado
On Mon, Nov 2, 2015 at 12:14 PM, Oliver Hartkopp  wrote:
>
> What about defining some overlay data structure to map ARINC-429 frames into
> CAN frames?
>
> E.g. we could write the ARINC 32 bit data completely into data[0..3] and
> additionally copy the 8 bit label information (or should it better be 10 bit
> including the Source/Destination Identifiers?) additionally into the can_id.

Note that the only bits which are always treated as non-data are the 8
label bits (well, and the parity bit #31). The 2 SDI bits (#8, #9) may
be used as data bits when a high resolution is needed, like Lat/Long
encoded in binary words 310 and 311. I wouldn't make any assumption on
what's on those 2 bits; i.e. they're not always "source/destination".

-- 
Aleksander
https://aleksander.es
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] mpls: Don't accept multipath configuration until the support is complete

2015-11-02 Thread Sergei Shtylyov

Hello.

On 11/02/2015 10:29 PM, Eric W. Biederman wrote:


Currently the multipath code has a nasty failure mode in that it will
fail to notice link down or administrative device down and will
instead black hole packets instead of sending them to their nexthop
destination.

Half the point of multipath is to gracefully handle forwarding path
failures and as the current code does not handle forwarding failures the
current code is dangerous to use.

As mpls multipath has never been exported to userspace and as the
implementation was not complete before the merge window disable the mpls
multipath code by rejecting all multipath configuration requests.  This
will give us another kernel development cycle to cleanly sort out the
issues, without any bad precedents to worry about.

Signed-off-by: "Eric W. Biederman" 
---
  net/mpls/af_mpls.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750148b6..893cd2dc3979 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1162,6 +1162,8 @@ static int rtm_to_route_config(struct sk_buff *skb,  
struct nlmsghdr *nlh,
{
cfg->rc_mp = nla_data(nla);
cfg->rc_mp_len = nla_len(nla);
+   /* Fail until multipath support is complete */
+   goto errout;
break;


   Forgot to delete *break*?


}
default:



MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 0/2] bridge: vlan: failure path and comment fixes

2015-11-02 Thread David Miller
From: Nikolay Aleksandrov 
Date: Fri, 30 Oct 2015 17:46:18 +0100

> This is a set from Ido which takes care of one failure path error in
> nbp_vlan_init (patch 1) and a few comment errors (patch 2).
> I must admit I didn't expect the port init continues after a vlan init
> failure but should've checked to make sure. Thanks to Ido for catching
> these!

Series applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next v4 6/8] dpaa_eth: add ethtool statistics

2015-11-02 Thread Joe Perches
On Mon, 2015-11-02 at 19:31 +0200, Madalin Bucur wrote:
> Add a series of counters to be exported through ethtool:
> - add detailed counters for reception errors;
> - add detailed counters for QMan enqueue reject events;
> - count the number of fragmented skbs received from the stack;
> - count all frames received on the Tx confirmation path;
> - add congestion group statistics;
> - count the number of interrupts for each CPU.
[]
> diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c 
> b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
[]
> +static void dpa_get_strings(struct net_device *net_dev, u32 stringset, u8 
> *data)
> +{
> + unsigned int i, j, num_cpus, size;
> + char string_cpu[ETH_GSTRING_LEN];
> + u8 *strings;
> +
> + strings   = data;
> + num_cpus  = num_online_cpus();
> + size  = DPA_STATS_GLOBAL_LEN * ETH_GSTRING_LEN;
> +
> + for (i = 0; i < DPA_STATS_PERCPU_LEN; i++) {
> + for (j = 0; j < num_cpus; j++) {
> + snprintf(string_cpu, ETH_GSTRING_LEN, "%s [CPU %d]",
> +  dpa_stats_percpu[i], j);
> + memcpy(strings, string_cpu, ETH_GSTRING_LEN);
> + strings += ETH_GSTRING_LEN;
> + }
> + snprintf(string_cpu, ETH_GSTRING_LEN, "%s [TOTAL]",
> +  dpa_stats_percpu[i]);
> + memcpy(strings, string_cpu, ETH_GSTRING_LEN);
> + strings += ETH_GSTRING_LEN;
> + }
> + memcpy(strings, dpa_stats_global, size);
> +}

This leaks uninitialized stack via a memcpy of uninitialized
string_cpu bytes into user-space.

Using
char string_cpu[ETH_GSTRING_LEN] = {};
or a memset before each snprintf would fix it.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 02/10] ss: created formatters for json and hr

2015-11-02 Thread Matthias Tafelmeier
> Your perception is incorrect.
> I am still maintaining iproute2. Phil is just providing lots of feedback
> and new patches.
No offense intended! I know, Phil has already clarified things in the
other responses. Maybe you overlooked those accidentally.

> The size of the change makes it harder to digest, and I do think
> adding JSON support is a good idea. Just concerned about the long
> term maintenance overhead. Plus I want the other utilities to have
> JSON output as well. Therefore this change is going to take longer
> to adopt and hopefully we can figure out a good way to do this kind
> of output.

I see! Well, do you conceive a decentralized approach or a centralized
one for all the utilities?
Decentralized would be as things are now – I mean what my patch series
tries to amount to.

> Almost want to go to C++ or something.

Would that be viable or is that a wishful line of thought?



signature.asc
Description: OpenPGP digital signature


RE: [PATCH][v2] net: phy: fix a bug in get_phy_c45_ids

2015-11-02 Thread Shaohui Xie
> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Tuesday, November 03, 2015 12:02 PM
> To: shh@gmail.com
> Cc: netdev@vger.kernel.org; f.faine...@gmail.com; Xie Shaohui-B21989
> Subject: Re: [PATCH][v2] net: phy: fix a bug in get_phy_c45_ids
> 
> From: 
> Date: Mon, 2 Nov 2015 18:48:33 +0800
> 
> > @@ -205,6 +205,37 @@ struct phy_device *phy_device_create(struct
> > mii_bus *bus, int addr, int phy_id,  }
> > EXPORT_SYMBOL(phy_device_create);
> >
> > + /* get_phy_c45_devs_in_pkg - reads a MMD's devices in package
> registers.
> > + * @bus: the target MII bus
> 
> Please remove the leading space on the first line of this new comment.
OK. Fixed in V3.

Thank you!

--Shaohui
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: avoid NULL deref in inet_ctl_sock_destroy()

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 22:46 -0500, David Miller wrote:
> From: Eric Dumazet 
> Date: Mon, 02 Nov 2015 07:50:07 -0800
> 
> > From: Eric Dumazet 
> > 
> > Under low memory conditions, tcp_sk_init() and icmp_sk_init()
> > can both iterate on all possible cpus and call inet_ctl_sock_destroy(),
> > with eventual NULL pointer.
> > 
> > Signed-off-by: Eric Dumazet 
> > Reported-by: Dmitry Vyukov 
> 
> Applied.

Thanks David.

Bug origin was in linux-4.2 :

commit 26abe14379f8e2fa3fd1bcf97c9a7ad9364886fe
Author: Eric W. Biederman 
Date:   Fri May 8 21:10:31 2015 -0500

net: Modify sk_alloc to not reference count the netns of kernel sockets.

Now that sk_alloc knows when a kernel socket is being allocated modify
it to not reference count the network namespace of kernel sockets.

Keep track of if a socket needs reference counting by adding a flag to
struct sock called sk_net_refcnt.

Update all of the callers of sock_create_kern to stop using
sk_change_net and sk_release_kernel as those hacks are no longer
needed, to avoid reference counting a kernel socket.

Signed-off-by: "Eric W. Biederman" 
Signed-off-by: David S. Miller 



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2] mpls: support for dead routes

2015-11-02 Thread Roopa Prabhu
From: Roopa Prabhu 

Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
routes due to link events. Also adds code to ignore dead
routes during route selection

Signed-off-by: Roopa Prabhu 
---
RFC to v1:
Addressed a few comments from Eric and Robert:
- remove support for weighted nexthops
- Use rt_nhn_alive in the rt structure to keep count of alive
routes.
What I have not done is: sort nexthops on link events.
I am not comfortable recreating or sorting nexthops on
every carrier change. This leaves scope for optimizing in the future

v1 to v2:
Fix dead nexthop checks as suggested by dave

 net/mpls/af_mpls.c  | 191 
 net/mpls/internal.h |   3 +
 2 files changed, 166 insertions(+), 28 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750..5e88118 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
-static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
-struct sk_buff *skb, bool bos)
+static u32 mpls_multipath_hash(struct mpls_route *rt,
+  struct sk_buff *skb, bool bos)
 {
struct mpls_entry_decoded dec;
struct mpls_shim_hdr *hdr;
bool eli_seen = false;
int label_index;
-   int nh_index = 0;
u32 hash = 0;
 
-   /* No need to look further into packet if there's only
-* one path
-*/
-   if (rt->rt_nhn == 1)
-   goto out;
-
for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
 label_index++) {
if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
@@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct 
mpls_route *rt,
}
}
 
-   nh_index = hash % rt->rt_nhn;
+   return hash;
+}
+
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+struct sk_buff *skb, bool bos)
+{
+   u32 hash = 0;
+   int nh_index;
+   int n = 0;
+
+   /* No need to look further into packet if there's only
+* one path
+*/
+   if (rt->rt_nhn == 1)
+   goto out;
+
+   if (rt->rt_nhn_alive <= 0)
+   return NULL;
+
+   hash = mpls_multipath_hash(rt, skb, bos);
+   nh_index = hash % rt->rt_nhn_alive;
+   for_nexthops(rt) {
+   if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+   continue;
+   if (n == nh_index)
+   return nh;
+   n++;
+   } endfor_nexthops(rt);
+
 out:
-   return &rt->rt_nh[nh_index];
+   return &rt->rt_nh[0];
 }
 
 static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
@@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 
max_alen)
 GFP_KERNEL);
if (rt) {
rt->rt_nhn = num_nh;
+   rt->rt_nhn_alive = num_nh;
rt->rt_max_alen = max_alen_aligned;
}
 
@@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct 
mpls_route *rt,
 
RCU_INIT_POINTER(nh->nh_dev, dev);
 
+   if (!netif_carrier_ok(dev))
+   nh->nh_flags |= RTNH_F_LINKDOWN;
+
+   if (!(dev->flags & IFF_UP))
+   nh->nh_flags |= RTNH_F_DEAD;
+
+   if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+   rt->rt_nhn_alive--;
+
return 0;
 
 errout:
@@ -577,7 +608,7 @@ errout:
 }
 
 static int mpls_nh_build(struct net *net, struct mpls_route *rt,
-struct mpls_nh *nh, int oif,
+struct mpls_nh *nh, int oif, int hops,
 struct nlattr *via, struct nlattr *newdst)
 {
int err = -ENOMEM;
@@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config 
*cfg,
/* neither weighted multipath nor any flags
 * are supported
 */
-   if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+   if (rtnh->rtnh_flags || rtnh->rtnh_flags)
goto errout;
 
attrlen = rtnh_attrlen(rtnh);
@@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config 
*cfg,
goto errout;
 
err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
-   rtnh->rtnh_ifindex, nla_via,
-   nla_newdst);
+   rtnh->rtnh_ifindex, rtnh->rtnh_hops,
+   nla_via, nla_newdst);
if (err)
goto errout;
 
@@ -875,34 +906,100 @@ free:
return 

Re: [PATCH net-next] bridge: vlan: Use rcu_dereference instead of rtnl_dereference

2015-11-02 Thread Jiri Pirko
Sun, Nov 01, 2015 at 05:31:45PM CET, ido...@mellanox.com wrote:
>br_should_learn() is protected by RCU and not by RTNL, so use correct
>flavor of nbp_vlan_group().
>
>Fixes: 907b1e6e83ed ("bridge: vlan: use proper rcu for the vlgrp
>member")
>Signed-off-by: Ido Schimmel 
>Acked-by: Nikolay Aleksandrov 

Acked-by: Jiri Pirko 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v7, 6/6] fsl/fman: Add FMan MAC driver

2015-11-02 Thread igal.liberman
From: Igal Liberman 

This patch adds the Ethernet MAC driver supporting the three
different types of MACs: dTSEC, tGEC and mEMAC.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/Makefile |3 +-
 drivers/net/ethernet/freescale/fman/mac.c|  980 ++
 drivers/net/ethernet/freescale/fman/mac.h|   97 +++
 3 files changed, 1079 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/mac.c
 create mode 100644 drivers/net/ethernet/freescale/fman/mac.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index 2eb0b9b..51fd2e6 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -1,6 +1,7 @@
 subdir-ccflags-y +=  -I$(srctree)/drivers/net/ethernet/freescale/fman
 
-obj-y  += fsl_fman.o fsl_fman_mac.o
+obj-y  += fsl_fman.o fsl_fman_mac.o fsl_mac.o
 
 fsl_fman-objs  := fman_muram.o fman.o fman_sp.o fman_port.o
 fsl_fman_mac-objs := fman_dtsec.o fman_memac.o fman_tgec.o
+fsl_mac-objs += mac.o
diff --git a/drivers/net/ethernet/freescale/fman/mac.c 
b/drivers/net/ethernet/freescale/fman/mac.c
new file mode 100644
index 000..9dd66bc
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -0,0 +1,980 @@
+/* Copyright 2008-2015 Freescale Semiconductor, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *  names of its contributors may be used to endorse or promote products
+ *  derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "mac.h"
+#include "fman_mac.h"
+#include "fman_dtsec.h"
+#include "fman_tgec.h"
+#include "fman_memac.h"
+
+#define MAC_DESCRIPTION "FSL FMan MAC API based driver"
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_AUTHOR("Emil Medve ");
+
+MODULE_DESCRIPTION(MAC_DESCRIPTION);
+
+struct mac_priv_s {
+   struct device   *dev;
+   void __iomem*vaddr;
+   u8  cell_index;
+   phy_interface_t phy_if;
+   struct fman *fman;
+   struct device_node  *phy_node;
+   /* List of multicast addresses */
+   struct list_headmc_addr_list;
+   struct platform_device  *eth_dev;
+   struct fixed_phy_status *fixed_link;
+   u16 speed;
+   u16 max_speed;
+
+   int (*enable)(struct fman_mac *mac_dev, enum comm_mode mode);
+   int (*disable)(struct fman_mac *mac_dev, enum comm_mode mode);
+};
+
+struct mac_address {
+   u8 addr[ETH_ALEN];
+   struct list_head list;
+};
+
+static void mac_exception(void *_mac_dev, enum fman_mac_exceptions ex)
+{
+   struct mac_device   *mac_dev;
+   struct mac_priv_s   *priv;
+
+   mac_dev = (struct mac_device *)_mac_dev;
+   priv = mac_dev->priv;
+
+   if (ex == FM_MAC_EX_10G_RX_FIFO_OVFL) {
+   /* don't flag RX FIFO after the first */
+   mac_dev->set_exception(mac_dev->fman_mac,
+  

[v7, 1/6] fsl/fman: Add FMan MURAM support

2015-11-02 Thread igal.liberman
From: Igal Liberman 

Add Frame Manager Multi-User RAM support.
This internal FMan memory block is used by the
FMan hardware modules, the management being made
through the generic allocator.

The FMan Internal memory, for example, is used for
allocating transmit and receive FIFOs.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/Kconfig   |1 +
 drivers/net/ethernet/freescale/Makefile  |2 +
 drivers/net/ethernet/freescale/fman/Kconfig  |8 ++
 drivers/net/ethernet/freescale/fman/Makefile |5 +
 drivers/net/ethernet/freescale/fman/fman_muram.c |  159 ++
 drivers/net/ethernet/freescale/fman/fman_muram.h |   51 +++
 6 files changed, 226 insertions(+)
 create mode 100644 drivers/net/ethernet/freescale/fman/Kconfig
 create mode 100644 drivers/net/ethernet/freescale/fman/Makefile
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_muram.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_muram.h

diff --git a/drivers/net/ethernet/freescale/Kconfig 
b/drivers/net/ethernet/freescale/Kconfig
index ff76d4e..f3f89cc 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -53,6 +53,7 @@ config FEC_MPC52xx_MDIO
  If compiled as module, it will be called fec_mpc52xx_phy.
 
 source "drivers/net/ethernet/freescale/fs_enet/Kconfig"
+source "drivers/net/ethernet/freescale/fman/Kconfig"
 
 config FSL_PQ_MDIO
tristate "Freescale PQ MDIO"
diff --git a/drivers/net/ethernet/freescale/Makefile 
b/drivers/net/ethernet/freescale/Makefile
index 71debd1..4097c58 100644
--- a/drivers/net/ethernet/freescale/Makefile
+++ b/drivers/net/ethernet/freescale/Makefile
@@ -17,3 +17,5 @@ gianfar_driver-objs := gianfar.o \
gianfar_ethtool.o
 obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o
 ucc_geth_driver-objs := ucc_geth.o ucc_geth_ethtool.o
+
+obj-$(CONFIG_FSL_FMAN) += fman/
diff --git a/drivers/net/ethernet/freescale/fman/Kconfig 
b/drivers/net/ethernet/freescale/fman/Kconfig
new file mode 100644
index 000..66b7296
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/Kconfig
@@ -0,0 +1,8 @@
+config FSL_FMAN
+   bool "FMan support"
+   depends on FSL_SOC || COMPILE_TEST
+   select GENERIC_ALLOCATOR
+   default n
+   help
+   Freescale Data-Path Acceleration Architecture Frame Manager
+   (FMan) support
diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
new file mode 100644
index 000..fc2e194
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -0,0 +1,5 @@
+subdir-ccflags-y +=  -I$(srctree)/drivers/net/ethernet/freescale/fman
+
+obj-y  += fsl_fman.o
+
+fsl_fman-objs  := fman_muram.o
diff --git a/drivers/net/ethernet/freescale/fman/fman_muram.c 
b/drivers/net/ethernet/freescale/fman/fman_muram.c
new file mode 100644
index 000..35d4a50
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/fman_muram.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2008-2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fman_muram.h"
+
+#include 
+#include 
+#include 

[v7, 4/6] fsl/fman: Add FMan SP support

2015-11-02 Thread igal.liberman
From: Igal Liberman 

The Storage Profiles contain parameters that are used
by the FMan for frame reception and transmission.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/Makefile  |2 +-
 drivers/net/ethernet/freescale/fman/fman_sp.c |  167 +
 drivers/net/ethernet/freescale/fman/fman_sp.h |  103 +++
 3 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_sp.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_sp.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index 43360d70..5141532 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -2,5 +2,5 @@ subdir-ccflags-y +=  
-I$(srctree)/drivers/net/ethernet/freescale/fman
 
 obj-y  += fsl_fman.o fsl_fman_mac.o
 
-fsl_fman-objs  := fman_muram.o fman.o
+fsl_fman-objs  := fman_muram.o fman.o fman_sp.o
 fsl_fman_mac-objs := fman_dtsec.o fman_memac.o fman_tgec.o
diff --git a/drivers/net/ethernet/freescale/fman/fman_sp.c 
b/drivers/net/ethernet/freescale/fman/fman_sp.c
new file mode 100644
index 000..f36c622
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/fman_sp.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2008 - 2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fman_sp.h"
+#include "fman.h"
+
+void fman_sp_set_buf_pools_in_asc_order_of_buf_sizes(struct fman_ext_pools
+*fm_ext_pools,
+u8 *ordered_array,
+u16 *sizes_array)
+{
+   u16 buf_size = 0;
+   int i = 0, j = 0, k = 0;
+
+   /* First we copy the external buffers pools information
+* to an ordered local array
+*/
+   for (i = 0; i < fm_ext_pools->num_of_pools_used; i++) {
+   /* get pool size */
+   buf_size = fm_ext_pools->ext_buf_pool[i].size;
+
+   /* keep sizes in an array according to poolId
+* for direct access
+*/
+   sizes_array[fm_ext_pools->ext_buf_pool[i].id] = buf_size;
+
+   /* save poolId in an ordered array according to size */
+   for (j = 0; j <= i; j++) {
+   /* this is the next free place in the array */
+   if (j == i)
+   ordered_array[i] =
+   fm_ext_pools->ext_buf_pool[i].id;
+   else {
+   /* find the right place for this poolId */
+   if (buf_size < sizes_array[ordered_array[j]]) {
+   /* move the pool_ids one place ahead
+* to make room for this poolId
+*/
+   for (k = i; k > j; k--)
+   ordered_array[k] =
+   

[net-next v4 0/8] dpaa_eth: Add the Freescale DPAA Ethernet driver

2015-11-02 Thread Madalin Bucur
This patch series adds the Ethernet driver for the Freescale
QorIQ Data Path Acceleration Architecture (DPAA).

This version includes changes following the feedback received
on previous versions from Eric Dumazet, Bob Cochran, Joe Perches,
Paul Bolle, Joakim Tjernlund, Scott Wood, David Miller - thank you.

Together with the driver a managed version of alloc_percpu
is provided that simplifies the release of per-CPU memory.

The Freescale DPAA architecture consists of a series of hardware
blocks that support the Ethernet connectivity. The Ethernet driver
depends upon the following drivers that are currently in the Linux
kernel or in review (the underlying drivers are not inter-dependent):
 - Peripheral Access Memory Unit (PAMU)
drivers/iommu/fsl_*
 - Frame Manager (FMan)
drivers/net/ethernet/freescale/fman
 - Queue Manager (QMan), Buffer Manager (BMan)
drivers/soc/fsl/qbman

dpaa_eth interfaces mapping to FMan MACs:

  dpaa_eth   /eth0\ ...   /ethN\
  driver|  | |  |
  -      ---      -
   -Ports  / Tx  Rx \.../ Tx  Rx \
  FMan|  | |  |
   -MACs  |   MAC0   | |   MACN   |
 /   dtsec0   \  ...  /   dtsecN   \ (or tgec)
/  \ /  \(or memac)
  -  --  ---  --  -
  FMan, FMan Port, FMan SP, FMan MURAM drivers
  -
  FMan HW blocks: MURAM, MACs, Ports, SP
  -

dpaa_eth relation to QMan, FMan:
  
  dpaa_eth   /eth0\
  driver/  \
  -   -^-   -^-   -^-   ----
  QMan driver / \   / \   / \  \   /  | BMan|
 |Rx | |Rx | |Tx | |Tx |  | driver  |
  -  |Dfl| |Err| |Cnf| |FQs|  | |
  QMan HW|FQ | |FQ | |FQ | |   |  | |
 /   \ /   \ /   \  \ /   | |
  -   ---   ---   ---   -v--
|FMan QMI | |
| FMan HW   FMan BMI  | BMan HW |
  ---   

where the acronyms used above (and in the code) are:
DPAA = Data Path Acceleration Architecture
FMan = DPAA Frame Manager
QMan = DPAA Queue Manager
BMan = DPAA Buffers Manager
QMI = QMan interface in FMan
BMI = BMan interface in FMan
FMan SP = FMan Storage Profiles
MURAM = Multi-user RAM in FMan
FQ = QMan Frame Queue
Rx Dfl FQ = default reception FQ
Rx Err FQ = Rx error frames FQ
Tx Cnf FQ = Tx confirmation FQ
Tx FQs = transmission frame queues
dtsec = datapath three speed Ethernet controller (10/100/1000 Mbps)
tgec = ten gigabit Ethernet controller (10 Gbps)
memac = multirate Ethernet MAC (10/100/1000/10000)

The latest FMan driver patches were submitted by Igal Liberman:
https://patchwork.ozlabs.org/project/netdev/list/?submitter=64715&state=*&q=[v7,

The latest Q/BMan drivers were submitted by Roy Pledge:
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?submitter=66331&state=*

Changes from v3:
 - removed bogus delay and comment in .ndo_stop implementation
 - addressed minor issues reported by David Miller

Changes from v2:
 - removed debugfs, moved exports to ethtool statistics
 - removed congestion groups Kconfig params

Changes from v1:
 - bpool level Kconfig options removed
 - print format using pr_fmt, cleaned up prints
 - __hot/__cold removed
 - gratuitous unlikely() removed
 - code style aligned, consistent spacing for declarations
 - comment formatting

The complete patch set based on the latest net-next/master kernel
can be found in the public git at:
http://git.freescale.com/git/cgit.cgi/ppc/upstream/linux.git
under the tag ldup_public_git_20151102:
http://git.freescale.com/git/cgit.cgi/ppc/upstream/linux.git/log/?h=ldup_public_git_20151102

There is one u-boot patch that one needs to make sure it's applied
to align u-boot to the latest device tree binding document specification
used by the FMan driver. Please make sure your u-boot includes this patch:

commit 97a8d010e029111e5711a45264a726bedbeb24c4
Author: Igal Liberman 
Date:   Tue Aug 18 14:47:05 2015 +0300

net/fman: Support both new and legacy FMan Compatibles

The patch was included in u-boot in v2015.10-rc3.

Madalin Bucur (8):
  devres: add devm_alloc_percpu()
  dpaa_eth: add support for DPAA Ethernet
  dpaa_eth: add support for S/G frames
  dpaa_eth: add driver's Tx queue selection
  dpaa_eth: add ethtool functionality
  dpaa_eth: add ethtool statistics
  dpaa_eth: add sysfs exports
  dpaa_eth: add trace points

 Documentation/driver-model/devres.txt  |4 +
 drivers/base/devres.c  |   64 +
 drivers/net/ethernet/freescale/Kconfig |2 +
 

[net-next v4 3/8] dpaa_eth: add support for S/G frames

2015-11-02 Thread Madalin Bucur
Add support for Scatter/Gather (S/G) frames. The FMan can place
the frame content into multiple buffers and provide a S/G Table
(SGT) into one first buffer with references to the others.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |   6 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |   2 +-
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.c  |  50 ++-
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.h  |   2 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth_sg.c  | 335 +++--
 5 files changed, 374 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 8381616..31d55b4 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -463,6 +463,12 @@ static int dpa_private_netdev_init(struct net_device 
*net_dev)
net_dev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 NETIF_F_LLTX);
 
+   /* Advertise S/G and HIGHDMA support for private interfaces */
+   net_dev->hw_features |= NETIF_F_SG | NETIF_F_HIGHDMA;
+   /* Recent kernels enable GSO automatically, if
+* we declare NETIF_F_SG. For conformity, we'll
+* still declare GSO explicitly.
+*/
net_dev->features |= NETIF_F_GSO;
 
return dpa_netdev_init(net_dev, mac_addr, tx_timeout);
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index 1cc8682..1ba6617 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -347,7 +347,7 @@ static inline void clear_fd(struct qm_fd *fd)
 }
 
 static inline int _dpa_tx_fq_to_id(const struct dpa_priv_s *priv,
-  struct qman_fq *tx_fq)
+   struct qman_fq *tx_fq)
 {
int i;
 
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
index 963be4d8..b36cbca 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
@@ -1177,10 +1177,42 @@ void dpaa_eth_init_ports(struct mac_device *mac_dev,
  port_fqs->rx_defq, &buf_layout[RX]);
 }
 
+void dpa_release_sgt(struct qm_sg_entry *sgt)
+{
+   struct dpa_bp *dpa_bp;
+   struct bm_buffer bmb[DPA_BUFF_RELEASE_MAX];
+   u8 i = 0, j;
+
+   memset(bmb, 0, sizeof(bmb));
+
+   do {
+   dpa_bp = dpa_bpid2pool(sgt[i].bpid);
+   DPA_ERR_ON(!dpa_bp);
+
+   j = 0;
+   do {
+   DPA_ERR_ON(sgt[i].extension);
+
+   bmb[j].hi = sgt[i].addr_hi;
+   bmb[j].lo = be32_to_cpu(sgt[i].addr_lo);
+
+   j++; i++;
+   } while (j < ARRAY_SIZE(bmb) &&
+   !sgt[i - 1].final &&
+   sgt[i - 1].bpid == sgt[i].bpid);
+
+   while (bman_release(dpa_bp->pool, bmb, j, 0))
+   cpu_relax();
+   } while (!sgt[i - 1].final);
+}
+
 void dpa_fd_release(const struct net_device *net_dev, const struct qm_fd *fd)
 {
+   struct qm_sg_entry *sgt;
struct dpa_bp *dpa_bp;
struct bm_buffer bmb;
+   dma_addr_t addr;
+   void *vaddr;
 
memset(&bmb, 0, sizeof(bmb));
bm_buffer_set64(&bmb, fd->addr);
@@ -1188,7 +1220,23 @@ void dpa_fd_release(const struct net_device *net_dev, 
const struct qm_fd *fd)
dpa_bp = dpa_bpid2pool(fd->bpid);
DPA_ERR_ON(!dpa_bp);
 
-   DPA_ERR_ON(fd->format == qm_fd_sg);
+   if (fd->format == qm_fd_sg) {
+   vaddr = phys_to_virt(fd->addr);
+   sgt = vaddr + dpa_fd_offset(fd);
+
+   dma_unmap_single(dpa_bp->dev, qm_fd_addr(fd), dpa_bp->size,
+DMA_BIDIRECTIONAL);
+
+   dpa_release_sgt(sgt);
+
+   addr = dma_map_single(dpa_bp->dev, vaddr, dpa_bp->size,
+ DMA_BIDIRECTIONAL);
+   if (dma_mapping_error(dpa_bp->dev, addr)) {
+   dev_err(dpa_bp->dev, "DMA mapping failed");
+   return;
+   }
+   bm_buffer_set64(, addr);
+   }
 
while (bman_release(dpa_bp->pool, , 1, 0))
cpu_relax();
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
index 68843c0..9df8f14 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
@@ -37,6 +37,7 @@
 
 #include "dpaa_eth.h"
 
+#define DPA_SGT_MAX_ENTRIES 16 /* maximum number of entries in SG Table */
 #define DPA_BUFF_RELEASE_MAX 8 /* maximum number of buffers released at 

[net-next v4 8/8] dpaa_eth: add trace points

2015-11-02 Thread Madalin Bucur
Add trace points on the hot processing path.

Signed-off-by: Ruxandra Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa/Makefile   |   1 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |  12 ++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |   4 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_trace.h   | 141 +
 4 files changed, 158 insertions(+)
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h

diff --git a/drivers/net/ethernet/freescale/dpaa/Makefile 
b/drivers/net/ethernet/freescale/dpaa/Makefile
index 141ade4..15ed1c4 100644
--- a/drivers/net/ethernet/freescale/dpaa/Makefile
+++ b/drivers/net/ethernet/freescale/dpaa/Makefile
@@ -9,3 +9,4 @@ ccflags-y += -I$(FMAN)
 obj-$(CONFIG_FSL_DPAA_ETH) += fsl_dpa.o
 
 fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o dpaa_ethtool.o 
dpaa_eth_sysfs.o
+CFLAGS_dpaa_eth.o := -I$(src)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 3cd03f5..b939d9d 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -57,6 +57,12 @@
 #include "dpaa_eth.h"
 #include "dpaa_eth_common.h"
 
+/* CREATE_TRACE_POINTS only needs to be defined once. Other dpa files
+ * using trace events only need to #include 
+ */
+#define CREATE_TRACE_POINTS
+#include "dpaa_eth_trace.h"
+
 /* Valid checksum indication */
 #define DPA_CSUM_VALID 0x
 
@@ -229,6 +235,9 @@ priv_rx_default_dqrr(struct qman_portal *portal,
priv = netdev_priv(net_dev);
dpa_bp = priv->dpa_bp;
 
+   /* Trace the Rx fd */
+   trace_dpa_rx_fd(net_dev, fq, >fd);
+
/* IRQ handler, non-migratable; safe to use raw_cpu_ptr here */
percpu_priv = raw_cpu_ptr(priv->percpu_priv);
count_ptr = raw_cpu_ptr(dpa_bp->percpu_count);
@@ -285,6 +294,9 @@ priv_tx_conf_default_dqrr(struct qman_portal *portal,
net_dev = ((struct dpa_fq *)fq)->net_dev;
priv = netdev_priv(net_dev);
 
+   /* Trace the fd */
+   trace_dpa_tx_conf_fd(net_dev, fq, >fd);
+
/* Non-migratable context, safe to use raw_cpu_ptr */
percpu_priv = raw_cpu_ptr(priv->percpu_priv);
 
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index cdc7595..7dee8de 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -36,6 +36,7 @@
 
 #include "fman.h"
 #include "mac.h"
+#include "dpaa_eth_trace.h"
 
 extern int dpa_rx_extra_headroom;
 extern int dpa_max_frm;
@@ -407,6 +408,9 @@ static inline int dpa_xmit(struct dpa_priv_s *priv,
if (fd->bpid == 0xff)
fd->cmd |= qman_fq_fqid(priv->conf_fqs[queue]);
 
+   /* Trace this Tx fd */
+   trace_dpa_tx_fd(priv->net_dev, egress_fq, fd);
+
for (i = 0; i < 10; i++) {
err = qman_enqueue(egress_fq, fd, 0);
if (err != -EBUSY)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h
new file mode 100644
index 000..3b67477
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h
@@ -0,0 +1,141 @@
+/* Copyright 2013-2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *  names of its contributors may be used to endorse or promote products
+ *  derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING 

Re: [PATCH] bpf: convert hashtab lock to raw lock

2015-11-02 Thread Shi, Yang

On 11/2/2015 12:59 AM, Thomas Gleixner wrote:

On Sun, 1 Nov 2015, Alexei Starovoitov wrote:

On Sat, Oct 31, 2015 at 09:47:36AM -0400, Steven Rostedt wrote:

On Fri, 30 Oct 2015 17:03:58 -0700
Alexei Starovoitov  wrote:


On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:

When running bpf samples on rt kernel, it reports the below warning:

BUG: sleeping function called from invalid context at 
kernel/locking/rtmutex.c:917
in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
Preemption disabled at:[] kprobe_perf_func+0x30/0x228

...

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..972b76b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
  struct bpf_htab {
struct bpf_map map;
struct hlist_head *buckets;
-   spinlock_t lock;
+   raw_spinlock_t lock;


How do we address such things in general?
I bet there are tons of places around the kernel that
call spin_lock from atomic.
I'd hate to lose the benefits of lockdep of non-raw spin_lock
just to make rt happy.


You won't lose any benefits of lockdep. Lockdep still checks
raw_spin_lock(). The only difference between raw_spin_lock and
spin_lock is that in -rt spin_lock turns into an rt_mutex() and
raw_spin_lock stays a spin lock.


I see. The patch makes sense then.
Would be good to document this peculiarity of spin_lock.


I'm working on a document.


Thanks Steven and Thomas for your elaboration and comment.

Yang



Thanks,

tglx



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: convert hashtab lock to raw lock

2015-11-02 Thread Steven Rostedt
On Mon, 02 Nov 2015 09:12:29 -0800
"Shi, Yang"  wrote:

> Yes, it is common practice for converting sleepable spin lock to raw 
> spin lock in -rt to avoid scheduling in atomic context bug.

Note, in a lot of cases we don't just convert spin_locks to raw because
of atomic context. There's times we need to change the design where the
lock is not taken in atomic context (switching preempt_disable() to a
local_lock() for example).

But bpf is much like ftrace and kprobes where they can be taken almost
anywhere, and they do indeed need to be raw.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: convert hashtab lock to raw lock

2015-11-02 Thread Shi, Yang

On 11/2/2015 9:24 AM, Steven Rostedt wrote:

On Mon, 02 Nov 2015 09:12:29 -0800
"Shi, Yang"  wrote:


Yes, it is common practice for converting sleepable spin lock to raw
spin lock in -rt to avoid scheduling in atomic context bug.


Note, in a lot of cases we don't just convert spin_locks to raw because
of atomic context. There's times we need to change the design where the
lock is not taken in atomic context (switching preempt_disable() to a
local_lock() for example).


Yes, definitely. Understood.

Thanks,
Yang



But bpf is much like ftrace and kprobes where they can be taken almost
anywhere, and they do indeed need to be raw.

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v7, 2/6] fsl/fman: Add FMan support

2015-11-02 Thread igal.liberman
From: Igal Liberman 

Add the Data Path Acceleration Architecture Frame Manager Driver.
The FMan embeds a series of hardware blocks that implement a group
of Ethernet interfaces. This patch adds the FMan configuration,
initialization and runtime control routines.

The FMan driver supports several hardware versions
differentiated by things like:
- Different types of MACs
- Number of MACs and ports
- Available resources
- Different hardware errata

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/Makefile |2 +-
 drivers/net/ethernet/freescale/fman/fman.c   | 2876 ++
 drivers/net/ethernet/freescale/fman/fman.h   |  325 +++
 3 files changed, 3202 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/fman.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index fc2e194..fb5a7f0 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -2,4 +2,4 @@ subdir-ccflags-y +=  
-I$(srctree)/drivers/net/ethernet/freescale/fman
 
 obj-y  += fsl_fman.o
 
-fsl_fman-objs  := fman_muram.o
+fsl_fman-objs  := fman_muram.o fman.o
diff --git a/drivers/net/ethernet/freescale/fman/fman.c 
b/drivers/net/ethernet/freescale/fman/fman.c
new file mode 100644
index 000..f97a52b
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -0,0 +1,2876 @@
+/*
+ * Copyright 2008-2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "fman.h"
+#include "fman_muram.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* General defines */
+#define FMAN_LIODN_TBL 64  /* size of LIODN table */
+#define MAX_NUM_OF_MACS10
+#define FM_NUM_OF_FMAN_CTRL_EVENT_REGS 4
+#define BASE_RX_PORTID 0x08
+#define BASE_TX_PORTID 0x28
+
+/* Modules registers offsets */
+#define BMI_OFFSET 0x0008
+#define QMI_OFFSET 0x00080400
+#define DMA_OFFSET 0x000C2000
+#define FPM_OFFSET 0x000C3000
+#define IMEM_OFFSET0x000C4000
+#define CGP_OFFSET 0x000DB000
+
+/* Exceptions bit map */
+#define EX_DMA_BUS_ERROR   0x8000
+#define EX_DMA_READ_ECC0x4000
+#define EX_DMA_SYSTEM_WRITE_ECC0x2000
+#define EX_DMA_FM_WRITE_ECC0x1000
+#define EX_FPM_STALL_ON_TASKS  0x0800
+#define EX_FPM_SINGLE_ECC  0x0400
+#define EX_FPM_DOUBLE_ECC  0x0200
+#define EX_QMI_SINGLE_ECC  0x0100
+#define EX_QMI_DEQ_FROM_UNKNOWN_PORTID 0x0080
+#define EX_QMI_DOUBLE_ECC  0x0040
+#define EX_BMI_LIST_RAM_ECC0x0020
+#define EX_BMI_STORAGE_PROFILE_ECC 0x0010
+#define EX_BMI_STATISTICS_RAM_ECC  0x0008
+#define EX_IRAM_ECC0x0004
+#define EX_MURAM_ECC  

[v7, 5/6] fsl/fman: Add FMan Port Support

2015-11-02 Thread igal.liberman
From: Igal Liberman 

Add the Data Path Acceleration Architecture Frame Manager Port Driver.
The FMan driver uses a module called "Port" to represent the physical
TX and RX ports.
Each FMan version has a different number of physical ports.
This patch adds the FMan Port configuration, initialization and
runtime control routines for both TX and RX.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/Makefile|2 +-
 drivers/net/ethernet/freescale/fman/fman_port.c | 1779 +++
 drivers/net/ethernet/freescale/fman/fman_port.h |  151 ++
 3 files changed, 1931 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_port.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_port.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index 5141532..2eb0b9b 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -2,5 +2,5 @@ subdir-ccflags-y +=  
-I$(srctree)/drivers/net/ethernet/freescale/fman
 
 obj-y  += fsl_fman.o fsl_fman_mac.o
 
-fsl_fman-objs  := fman_muram.o fman.o fman_sp.o
+fsl_fman-objs  := fman_muram.o fman.o fman_sp.o fman_port.o
 fsl_fman_mac-objs := fman_dtsec.o fman_memac.o fman_tgec.o
diff --git a/drivers/net/ethernet/freescale/fman/fman_port.c 
b/drivers/net/ethernet/freescale/fman/fman_port.c
new file mode 100644
index 000..56ecf2b
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/fman_port.c
@@ -0,0 +1,1779 @@
+/*
+ * Copyright 2008 - 2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "fman_port.h"
+#include "fman.h"
+#include "fman_sp.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Queue ID */
+#define DFLT_FQ_ID 0x00FF
+
+/* General defines */
+#define PORT_BMI_FIFO_UNITS0x100
+
+#define MAX_PORT_FIFO_SIZE(bmi_max_fifo_size)  \
+   min((u32)bmi_max_fifo_size, (u32)1024 * FMAN_BMI_FIFO_UNITS)
+
+#define PORT_CG_MAP_NUM8
+#define PORT_PRS_RESULT_WORDS_NUM  8
+#define PORT_IC_OFFSET_UNITS   0x10
+
+#define MIN_EXT_BUF_SIZE   64
+
+#define BMI_PORT_REGS_OFFSET   0
+#define QMI_PORT_REGS_OFFSET   0x400
+
+/* Default values */
+#define DFLT_PORT_BUFFER_PREFIX_CONTEXT_DATA_ALIGN \
+   DFLT_FM_SP_BUFFER_PREFIX_CONTEXT_DATA_ALIGN
+
+#define DFLT_PORT_CUT_BYTES_FROM_END   4
+
+#define DFLT_PORT_ERRORS_TO_DISCARDFM_PORT_FRM_ERR_CLS_DISCARD
+#define DFLT_PORT_MAX_FRAME_LENGTH 9600
+
+#define DFLT_PORT_RX_FIFO_PRI_ELEVATION_LEV(bmi_max_fifo_size) \
+   MAX_PORT_FIFO_SIZE(bmi_max_fifo_size)
+
+#define DFLT_PORT_RX_FIFO_THRESHOLD(major, bmi_max_fifo_size)  \
+   (major == 6 ?   \
+   MAX_PORT_FIFO_SIZE(bmi_max_fifo_size) : \
+   (MAX_PORT_FIFO_SIZE(bmi_max_fifo_size) * 3 / 4))\
+
+#define DFLT_PORT_EXTRA_NUM_OF_FIFO_BUFS   0
+
+/* QMI defines */
+#define 

[PATCH net-next] net: fix percpu memory leaks

2015-11-02 Thread Eric Dumazet
From: Eric Dumazet 

This patch fixes the following problems:

1) percpu_counter_init() can return an error, therefore
  init_frag_mem_limit() must propagate this error so that
  inet_frags_init_net() can do the same up to its callers.

2) If ip[46]_frags_ns_ctl_register() fails, we must unwind
   properly and free the percpu_counter.

Without this fix, we leave a freed object in percpu_counters
global list (if CONFIG_HOTPLUG_CPU) leading to crashes.

This bug was detected by KASAN and syzkaller tool
(http://github.com/google/syzkaller)

Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem 
accounting")
Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov 
Cc: Hannes Frederic Sowa 
Cc: Jesper Dangaard Brouer 
---
 include/net/inet_frag.h |   15 +--
 net/ieee802154/6lowpan/reassembly.c |   11 ---
 net/ipv4/inet_fragment.c|6 --
 net/ipv4/ip_fragment.c  |   12 +---
 net/ipv6/netfilter/nf_conntrack_reasm.c |   12 +---
 net/ipv6/reassembly.c   |   12 +---
 6 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 53eead2da743..ac42bbb37b2d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -108,7 +108,15 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-void inet_frags_init_net(struct netns_frags *nf);
+static inline int inet_frags_init_net(struct netns_frags *nf)
+{
+   return percpu_counter_init(>mem, 0, GFP_KERNEL);
+}
+static inline void inet_frags_uninit_net(struct netns_frags *nf)
+{
+   percpu_counter_destroy(>mem);
+}
+
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
 void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
@@ -154,11 +162,6 @@ static inline void add_frag_mem_limit(struct netns_frags 
*nf, int i)
__percpu_counter_add(>mem, i, frag_percpu_counter_batch);
 }
 
-static inline void init_frag_mem_limit(struct netns_frags *nf)
-{
-   percpu_counter_init(>mem, 0, GFP_KERNEL);
-}
-
 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
 {
unsigned int res;
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 12e8cf4bda9f..6b437e8760d3 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -580,14 +580,19 @@ static int __net_init lowpan_frags_init_net(struct net 
*net)
 {
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
+   int res;
 
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-   inet_frags_init_net(_lowpan->frags);
-
-   return lowpan_frags_ns_sysctl_register(net);
+   res = inet_frags_init_net(_lowpan->frags);
+   if (res)
+   return res;
+   res = lowpan_frags_ns_sysctl_register(net);
+   if (res)
+   inet_frags_uninit_net(_lowpan->frags);
+   return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index d0a7c0319e3d..fe144dae7372 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -209,12 +209,6 @@ int inet_frags_init(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_init);
 
-void inet_frags_init_net(struct netns_frags *nf)
-{
-   init_frag_mem_limit(nf);
-}
-EXPORT_SYMBOL(inet_frags_init_net);
-
 void inet_frags_fini(struct inet_frags *f)
 {
cancel_work_sync(>frags_work);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 5482745d5d68..1fe55ae81781 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -839,6 +839,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+   int res;
+
/* Fragment cache limits.
 *
 * The fragment memory accounting code, (tries to) account for
@@ -862,9 +864,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 */
net->ipv4.frags.timeout = IP_FRAG_TIME;
 
-   inet_frags_init_net(>ipv4.frags);
-
-   return ip4_frags_ns_ctl_register(net);
+   res = inet_frags_init_net(>ipv4.frags);
+   if (res)
+   return res;
+   res = ip4_frags_ns_ctl_register(net);
+   if (res)
+   inet_frags_uninit_net(>ipv4.frags);
+   return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 99610547fccc..d5efeb87350e 100644
--- 

Re: [PATCH net-next] af_unix: optimize unix_writable by inlining

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 12:01 -0500, Aaron Conole wrote:
> unix_writable() originally was inlined, but was changed as part of
> commit 1586a5877db9 ("af_unix: do not report POLLOUT on
> listeners"). Re-enable the inline flag.
> 
> Signed-off-by: Aaron Conole 
> Cc: Eric Dumazet 
> ---


We leave this to the compiler nowadays.

If you take a look at disassembly, you'll see your patch has no effect
at all.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel panic in 4.2.3, rb_erase in sch_fq

2015-11-02 Thread Eric Dumazet
On Mon, 2015-11-02 at 17:58 +0200, Denys Fedoryshchenko wrote:
> On 2015-11-02 17:24, Eric Dumazet wrote:
> > On Mon, 2015-11-02 at 16:11 +0200, Denys Fedoryshchenko wrote:
> >> Hi!
> >> 
> >> Actually seems i was getting this panic for a while (once per week) on
> >> loaded pppoe server, but just now was able to get full panic message.
> >> After checking commit logs on sch_fq.c i didnt seen any fixes, so
> >> probably upgrading to newer kernel wont help?
> > 
> > I do not think we support sch_fq as a HTB leaf.
> > 
> > If you want both HTB and sch_fq, you need to setup a bonding device.
> > 
> > HTB on bond0
> > 
> > sch_fq on the slaves
> > 
> > Sure, the kernel should not crash, but HTB+sch_fq on same net device is
> > certainly not something that will work anyway.
> Strange, because except ppp, on static devices it works really very well 
> in such scheme. It is the only solution that can throttle incoming 
> bandwidth, when bandwidth is very overbooked - reliably, for my use 
> cases, such as 256k+ flows/2.5Gbps and several different classes of 
> traffic, so using DRR will end up in just not enough classes.
> 
> On latest kernels i had to patch tc to provide parameter for orphan mask 
> in fq, to increase number for flows for transit traffic.
> None of other qdiscs able to solve this problem, incoming bandwidth 
> simply flowing 10-20% more than set, but fq is doing magic.
> The only device that was working with similar efficiency for such cases 
> - proprietary PacketShaper, but is modifying tcp window size, and can't 
> be called transparent, and also has stability issues over 1Gbps.

Ah, I was thinking you needed more like 10Gb traffic ;)

with HTB on bonding, we can use MQ+FQ on the slaves in order to use many
cpus to serve local traffic.

But yes, if you use HTB+FQ for forwarding, I guess the bonding setup is
not really needed.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next v4 4/8] dpaa_eth: add driver's Tx queue selection

2015-11-02 Thread Madalin Bucur
Allow the selection of the transmission queue based on the CPU id.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/Kconfig   | 10 ++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c|  3 +++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h|  6 ++
 drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c |  8 
 drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h |  4 
 5 files changed, 31 insertions(+)

diff --git a/drivers/net/ethernet/freescale/dpaa/Kconfig 
b/drivers/net/ethernet/freescale/dpaa/Kconfig
index 022d5aa..2577aac 100644
--- a/drivers/net/ethernet/freescale/dpaa/Kconfig
+++ b/drivers/net/ethernet/freescale/dpaa/Kconfig
@@ -11,6 +11,16 @@ menuconfig FSL_DPAA_ETH
 
 if FSL_DPAA_ETH
 
+config FSL_DPAA_ETH_USE_NDO_SELECT_QUEUE
+   bool "Use driver's Tx queue selection mechanism"
+   default y
+   ---help---
+ The DPAA Ethernet driver defines a ndo_select_queue() callback for 
optimal selection
+ of the egress FQ. That will override the XPS support for this 
netdevice.
+ If for whatever reason you want to be in control of the egress 
FQ-to-CPU selection and mapping,
+ or simply don't want to use the driver's ndo_select_queue() callback, 
then unselect this
+ and use the standard XPS support instead.
+
 config FSL_DPAA_ETH_FRIENDLY_IF_NAME
bool "Use fmX-macY names for the DPAA interfaces"
default y
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 31d55b4..894f1a7 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -390,6 +390,9 @@ static const struct net_device_ops dpa_private_ops = {
.ndo_get_stats64 = dpa_get_stats64,
.ndo_set_mac_address = dpa_set_mac_address,
.ndo_validate_addr = eth_validate_addr,
+#ifdef CONFIG_FSL_DPAA_ETH_USE_NDO_SELECT_QUEUE
+   .ndo_select_queue = dpa_select_queue,
+#endif
.ndo_change_mtu = dpa_change_mtu,
.ndo_set_rx_mode = dpa_set_rx_mode,
.ndo_init = dpa_ndo_init,
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index 1ba6617..87577cf 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -420,9 +420,15 @@ static inline void _dpa_assign_wq(struct dpa_fq *fq)
}
 }
 
+#ifdef CONFIG_FSL_DPAA_ETH_USE_NDO_SELECT_QUEUE
+/* Use in lieu of skb_get_queue_mapping() */
+#define dpa_get_queue_mapping(skb) \
+   raw_smp_processor_id()
+#else
 /* Use the queue selected by XPS */
 #define dpa_get_queue_mapping(skb) \
skb_get_queue_mapping(skb)
+#endif
 
 static inline void _dpa_bp_free_pf(void *addr)
 {
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
index b36cbca..89f3b1f 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
@@ -593,6 +593,14 @@ bool dpa_bpid2pool_use(int bpid)
return false;
 }
 
+#ifdef CONFIG_FSL_DPAA_ETH_USE_NDO_SELECT_QUEUE
+u16 dpa_select_queue(struct net_device *net_dev, struct sk_buff *skb,
+void *accel_priv, select_queue_fallback_t fallback)
+{
+   return dpa_get_queue_mapping(skb);
+}
+#endif
+
 struct dpa_fq *dpa_fq_alloc(struct device *dev,
const struct fqid_cell *fqids,
struct list_head *list,
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
index 9df8f14..2e9471d 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
@@ -70,6 +70,10 @@ struct dpa_bp *dpa_bpid2pool(int bpid);
 void dpa_bpid2pool_map(int bpid, struct dpa_bp *dpa_bp);
 bool dpa_bpid2pool_use(int bpid);
 void dpa_bp_drain(struct dpa_bp *bp);
+#ifdef CONFIG_FSL_DPAA_ETH_USE_NDO_SELECT_QUEUE
+u16 dpa_select_queue(struct net_device *net_dev, struct sk_buff *skb,
+void *accel_priv, select_queue_fallback_t fallback);
+#endif
 struct dpa_fq *dpa_fq_alloc(struct device *dev,
const struct fqid_cell *fqids,
struct list_head *list,
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v7, 3/6] fsl/fman: Add FMan MAC support

2015-11-02 Thread igal.liberman
From: Igal Liberman 

Add the Data Path Acceleration Architecture Frame Manager MAC support.
This patch adds the FMan MAC configuration, initialization and
runtime control routines.
This patch contains support for these types of MACs:
- dTSEC: Three speed Ethernet controller (10/100/1000 Mbps)
- tGEC: 10G Ethernet controller (10 Gbps)
- mEMAC: Multi-rate Ethernet MAC (10/100/1000/10000 Mbps)
Different FMan revisions have different types and numbers of MACs.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/Makefile   |3 +-
 .../net/ethernet/freescale/fman/crc_mac_addr_ext.h |  314 
 drivers/net/ethernet/freescale/fman/fman_dtsec.c   | 1609 
 drivers/net/ethernet/freescale/fman/fman_dtsec.h   |   59 +
 drivers/net/ethernet/freescale/fman/fman_mac.h |  276 
 drivers/net/ethernet/freescale/fman/fman_memac.c   | 1307 
 drivers/net/ethernet/freescale/fman/fman_memac.h   |   60 +
 drivers/net/ethernet/freescale/fman/fman_tgec.c|  798 ++
 drivers/net/ethernet/freescale/fman/fman_tgec.h|   55 +
 9 files changed, 4480 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/crc_mac_addr_ext.h
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_dtsec.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_dtsec.h
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_mac.h
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_memac.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_memac.h
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_tgec.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_tgec.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index fb5a7f0..43360d70 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -1,5 +1,6 @@
 subdir-ccflags-y +=  -I$(srctree)/drivers/net/ethernet/freescale/fman
 
-obj-y  += fsl_fman.o
+obj-y  += fsl_fman.o fsl_fman_mac.o
 
 fsl_fman-objs  := fman_muram.o fman.o
+fsl_fman_mac-objs := fman_dtsec.o fman_memac.o fman_tgec.o
diff --git a/drivers/net/ethernet/freescale/fman/crc_mac_addr_ext.h 
b/drivers/net/ethernet/freescale/fman/crc_mac_addr_ext.h
new file mode 100644
index 000..92f2e87
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/crc_mac_addr_ext.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2008-2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Define a macro that calculates the CRC value of an Ethernet MAC address
+ * (48-bit address)
+ */
+
+#ifndef __crc_mac_addr_ext_h
+#define __crc_mac_addr_ext_h
+
+#include 
+
+static u32 crc_table[256] = {
+   0x00000000,
+   0x77073096,
+   0xee0e612c,
+   0x990951ba,
+   0x076dc419,
+   0x706af48f,
+   0xe963a535,
+   0x9e6495a3,
+   0x0edb8832,
+   0x79dcb8a4,
+   0xe0d5e91e,
+   0x97d2d988,
+   0x09b64c2b,
+   0x7eb17cbd,
+   0xe7b82d07,
+   0x90bf1d91,
+   0x1db71064,
+   0x6ab020f2,
+   0xf3b97148,
+   

Re: [PATCH net] net: avoid NULL deref in inet_ctl_sock_destroy()

2015-11-02 Thread Hannes Frederic Sowa
On Mon, Nov 2, 2015, at 16:50, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> Under low memory conditions, tcp_sk_init() and icmp_sk_init()
> can both iterate on all possible cpus and call inet_ctl_sock_destroy(),
> with eventual NULL pointer.
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Dmitry Vyukov 

Eric, was this a private report or some of those floating around
publicly?

Thanks,
Hannes
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next v4 1/8] devres: add devm_alloc_percpu()

2015-11-02 Thread Madalin Bucur
Introduce managed counterparts for alloc_percpu() and free_percpu().
Add devm_alloc_percpu() and devm_free_percpu() into the managed
interfaces list.

Signed-off-by: Madalin Bucur 
Tested-by: Madalin-Cristian Bucur 
---
 Documentation/driver-model/devres.txt |  4 +++
 drivers/base/devres.c | 64 +++
 include/linux/device.h| 19 +++
 3 files changed, 87 insertions(+)

diff --git a/Documentation/driver-model/devres.txt 
b/Documentation/driver-model/devres.txt
index 831a536..595fd1b 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -312,6 +312,10 @@ MEM
   devm_kvasprintf()
   devm_kzalloc()
 
+PER-CPU MEM
+  devm_alloc_percpu()
+  devm_free_percpu()
+
 PCI
   pcim_enable_device() : after success, all PCI ops become managed
   pcim_pin_device(): keep PCI device enabled after release
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 8754646..6c314cc 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "base.h"
 
@@ -984,3 +985,66 @@ void devm_free_pages(struct device *dev, unsigned long 
addr)
   ));
 }
 EXPORT_SYMBOL_GPL(devm_free_pages);
+
+static void devm_percpu_release(struct device *dev, void *pdata)
+{
+   void __percpu *p;
+
+   p = *(void __percpu **)pdata;
+   free_percpu(p);
+}
+
+static int devm_percpu_match(struct device *dev, void *data, void *p)
+{
+   struct devres *devr = container_of(data, struct devres, data);
+
+   return *(void **)devr->data == p;
+}
+
+/**
+ * __devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @size: Size of per-cpu memory to allocate
+ * @align: Alignment of per-cpu memory to allocate
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
+   size_t align)
+{
+   void *p;
+   void __percpu *pcpu;
+
+   pcpu = __alloc_percpu(size, align);
+   if (!pcpu)
+   return NULL;
+
+   p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL);
+   if (!p)
+   return NULL;
+
+   *(void __percpu **)p = pcpu;
+
+   devres_add(dev, p);
+
+   return pcpu;
+}
+EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
+
+/**
+ * devm_free_percpu - Resource-managed free_percpu
+ * @dev: Device this memory belongs to
+ * @pdata: Per-cpu memory to free
+ *
+ * Free memory allocated with devm_alloc_percpu().
+ */
+void devm_free_percpu(struct device *dev, void __percpu *pdata)
+{
+   WARN_ON(devres_destroy(dev, devm_percpu_release, devm_percpu_match,
+  (void *)pdata));
+}
+EXPORT_SYMBOL_GPL(devm_free_percpu);
diff --git a/include/linux/device.h b/include/linux/device.h
index 5d7bc63..b563cc5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -673,6 +673,25 @@ void __iomem *devm_ioremap_resource(struct device *dev, 
struct resource *res);
 int devm_add_action(struct device *dev, void (*action)(void *), void *data);
 void devm_remove_action(struct device *dev, void (*action)(void *), void 
*data);
 
+/**
+ * devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @type: Type to allocate per-cpu memory for
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+#define devm_alloc_percpu(dev, type)  \
+   (typeof(type) __percpu *)__devm_alloc_percpu(dev, sizeof(type), \
+__alignof__(type))
+
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
+  size_t align);
+void devm_free_percpu(struct device *dev, void __percpu *pdata);
+
 struct device_dma_parameters {
/*
 * a low level driver may set these to teach IOMMU code about
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-02 Thread Nelson, Shannon


> -Original Message-
> From: Sowmini Varadhan [mailto:sowmini.varad...@oracle.com]
> Sent: Sunday, November 01, 2015 4:07 PM
> 
> On (11/01/15 21:03), Nelson, Shannon wrote:
> > .. In the meantime, be sure to test what happens over a reset, such as
> what
> > happens when the MTU is changed.  This will make sure that the replay
> > of mac and vlan filters happens correctly.  You'll want to test this
> > with and without vlans.
> 
> I assume you mean .1q (aka linux macvlan) as opposed to access/trunk
> vlans?

Yes, this is what I had in mind.

> I will test that tomorrow but I did a quick sanity check on mtu, as well
> as turning tso on/off which also restarts the driver (I believe), and
> it was "fine", i.e., able to ping offlink hosts.
> 
> --Sowmini

Great, thanks.

sln
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel panic in 4.2.3, rb_erase in sch_fq

2015-11-02 Thread Denys Fedoryshchenko

On 2015-11-02 17:24, Eric Dumazet wrote:

On Mon, 2015-11-02 at 16:11 +0200, Denys Fedoryshchenko wrote:

Hi!

Actually seems i was getting this panic for a while (once per week) on
loaded pppoe server, but just now was able to get full panic message.
After checking commit logs on sch_fq.c i didnt seen any fixes, so
probably upgrading to newer kernel wont help?


I do not think we support sch_fq as a HTB leaf.

If you want both HTB and sch_fq, you need to setup a bonding device.

HTB on bond0

sch_fq on the slaves

Sure, the kernel should not crash, but HTB+sch_fq on same net device is
certainly not something that will work anyway.
Strange, because except ppp, on static devices it works really very well 
in such scheme. It is the only solution that can throttle incoming 
bandwidth, when bandwidth is very overbooked - reliably, for my use 
cases, such as 256k+ flows/2.5Gbps and several different classes of 
traffic, so using DRR will end up in just not enough classes.


On latest kernels i had to patch tc to provide parameter for orphan mask 
in fq, to increase number for flows for transit traffic.
None of other qdiscs able to solve this problem, incoming bandwidth 
simply flowing 10-20% more than set, but fq is doing magic.
The only device that was working with similar efficiency for such cases 
- proprietary PacketShaper, but is modifying tcp window size, and can't 
be called transparent, and also has stability issues over 1Gbps.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel panic in 4.2.3, rb_erase in sch_fq

2015-11-02 Thread Denys Fedoryshchenko

On 2015-11-02 18:12, Eric Dumazet wrote:

On Mon, 2015-11-02 at 17:58 +0200, Denys Fedoryshchenko wrote:

On 2015-11-02 17:24, Eric Dumazet wrote:
> On Mon, 2015-11-02 at 16:11 +0200, Denys Fedoryshchenko wrote:
>> Hi!
>>
>> Actually seems i was getting this panic for a while (once per week) on
>> loaded pppoe server, but just now was able to get full panic message.
>> After checking commit logs on sch_fq.c i didnt seen any fixes, so
>> probably upgrading to newer kernel wont help?
>
> I do not think we support sch_fq as a HTB leaf.
>
> If you want both HTB and sch_fq, you need to setup a bonding device.
>
> HTB on bond0
>
> sch_fq on the slaves
>
> Sure, the kernel should not crash, but HTB+sch_fq on same net device is
> certainly not something that will work anyway.
Strange, because except ppp, on static devices it works really very 
well

in such scheme. It is the only solution that can throttle incoming
bandwidth, when bandwidth is very overbooked - reliably, for my use
cases, such as 256k+ flows/2.5Gbps and several different classes of
traffic, so using DRR will end up in just not enough classes.

On latest kernels i had to patch tc to provide parameter for orphan 
mask

in fq, to increase number for flows for transit traffic.
None of other qdiscs able to solve this problem, incoming bandwidth
simply flowing 10-20% more than set, but fq is doing magic.
The only device that was working with similar efficiency for such 
cases
- proprietary PacketShaper, but is modifying tcp window size, and 
can't

be called transparent, and also has stability issues over 1Gbps.


Ah, I was thinking you needed more like 10Gb traffic ;)

with HTB on bonding, we can use MQ+FQ on the slaves in order to use 
many

cpus to serve local traffic.

But yes, if you use HTB+FQ for forwarding, I guess the bonding setup is
not really needed.
Well, here country is very underdeveloped in matters of technology. 10G 
interfaces appeared in some ISP only this year.
On the ppp interfaces where crash happening - it is even less bandwidth. 
Each user max 1-2Mbps(average usage 128kbps), 4.5k interfaces.
But i have some more heavy setups there, around 9k pppoe users 
terminated on single server, (means 9k interfaces), about 2Gbps traffic 
passing thru.
If i take non-FOSS solution, i will have to pay for software licenses 
$100k+, which is unbearable for local ISP. fq is not critical in this 
specific use case, i can use for ppp interfaces fifo or such, but i 
guess better to report a bug :)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next v4 7/8] dpaa_eth: add sysfs exports

2015-11-02 Thread Madalin Bucur
Export Frame Queue and Buffer Pool IDs through sysfs.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/Makefile   |   2 +-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |   2 +
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |   3 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.c  |   2 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c   | 167 +
 5 files changed, 175 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c

diff --git a/drivers/net/ethernet/freescale/dpaa/Makefile 
b/drivers/net/ethernet/freescale/dpaa/Makefile
index 9b75d52..141ade4 100644
--- a/drivers/net/ethernet/freescale/dpaa/Makefile
+++ b/drivers/net/ethernet/freescale/dpaa/Makefile
@@ -8,4 +8,4 @@ ccflags-y += -I$(FMAN)
 
 obj-$(CONFIG_FSL_DPAA_ETH) += fsl_dpa.o
 
-fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o dpaa_ethtool.o
+fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o dpaa_ethtool.o 
dpaa_eth_sysfs.o
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 0b3332a..3cd03f5 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -756,6 +756,8 @@ dpaa_eth_priv_probe(struct platform_device *pdev)
if (err < 0)
goto netdev_init_failed;
 
+   dpaa_eth_sysfs_init(&net_dev->dev);
+
pr_info("Probed interface %s\n", net_dev->name);
 
return 0;
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index ccaadd9..cdc7595 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -371,6 +371,9 @@ static inline u16 dpa_get_headroom(struct 
dpa_buffer_layout_s *bl)
return bl->data_align ? ALIGN(headroom, bl->data_align) : headroom;
 }
 
+void dpaa_eth_sysfs_remove(struct device *dev);
+void dpaa_eth_sysfs_init(struct device *dev);
+
 void dpa_private_napi_del(struct net_device *net_dev);
 
 static inline void clear_fd(struct qm_fd *fd)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
index 4947cb9..2cf4565 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
@@ -299,6 +299,8 @@ int dpa_remove(struct platform_device *pdev)
 
priv = netdev_priv(net_dev);
 
+   dpaa_eth_sysfs_remove(dev);
+
dev_set_drvdata(dev, NULL);
unregister_netdev(net_dev);
 
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c
new file mode 100644
index 000..a6c71b1
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c
@@ -0,0 +1,167 @@
+/* Copyright 2008-2015 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *  names of its contributors may be used to endorse or promote products
+ *  derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "dpaa_eth.h"
+#include "mac.h"
+
+static ssize_t dpaa_eth_show_addr(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+   struct 

[net-next v4 5/8] dpaa_eth: add ethtool functionality

2015-11-02 Thread Madalin Bucur
Add support for basic ethtool operations.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/Makefile   |   2 +-
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.c  |   2 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_common.h  |   3 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 230 +
 4 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c

diff --git a/drivers/net/ethernet/freescale/dpaa/Makefile 
b/drivers/net/ethernet/freescale/dpaa/Makefile
index 3847ec7..9b75d52 100644
--- a/drivers/net/ethernet/freescale/dpaa/Makefile
+++ b/drivers/net/ethernet/freescale/dpaa/Makefile
@@ -8,4 +8,4 @@ ccflags-y += -I$(FMAN)
 
 obj-$(CONFIG_FSL_DPAA_ETH) += fsl_dpa.o
 
-fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o
+fsl_dpa-objs += dpaa_eth.o dpaa_eth_sg.o dpaa_eth_common.o dpaa_ethtool.o
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
index 89f3b1f..2b95696 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.c
@@ -102,6 +102,8 @@ int dpa_netdev_init(struct net_device *net_dev,
memcpy(net_dev->perm_addr, mac_addr, net_dev->addr_len);
memcpy(net_dev->dev_addr, mac_addr, net_dev->addr_len);
 
+   net_dev->ethtool_ops = &dpa_ethtool_ops;
+
net_dev->needed_headroom = priv->tx_headroom;
net_dev->watchdog_timeo = msecs_to_jiffies(tx_timeout);
 
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
index 2e9471d..160a018 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_common.h
@@ -43,6 +43,9 @@
 /* used in napi related functions */
 extern u16 qman_portal_max;
 
+/* from dpa_ethtool.c */
+extern const struct ethtool_ops dpa_ethtool_ops;
+
 int dpa_netdev_init(struct net_device *net_dev,
const u8 *mac_addr,
u16 tx_timeout);
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
new file mode 100644
index 000..fa8ba69
--- /dev/null
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -0,0 +1,230 @@
+/* Copyright 2008-2015 Freescale Semiconductor, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ *  names of its contributors may be used to endorse or promote products
+ *  derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+
+#include "dpaa_eth.h"
+#include "mac.h"
+#include "dpaa_eth_common.h"
+
+static int dpa_get_settings(struct net_device *net_dev,
+   struct ethtool_cmd *et_cmd)
+{
+   int err;
+   struct dpa_priv_s *priv;
+
+   priv = netdev_priv(net_dev);
+
+   if (!priv->mac_dev->phy_dev) {
+   netdev_dbg(net_dev, "phy device not initialized\n");
+   return 0;
+   }
+
+   err = phy_ethtool_gset(priv->mac_dev->phy_dev, et_cmd);
+
+   return err;
+}
+
+static int dpa_set_settings(struct net_device *net_dev,
+   struct ethtool_cmd *et_cmd)
+{
+   int err;
+   struct dpa_priv_s *priv;
+
+   priv = 

  1   2   >