date:20171116

[PATCH net v2 0/1] NULL pointer dereference in ipvlan_port_destroy

2017-11-16 Thread Girish Moodalbail

From code inspection, it appears possible for ipvlan_port_destroy() to
be called on a port (struct ipvl_port) that has already been destroyed
and is therefore already NULL. However, we don't check for NULL and
continue to access its fields, which results in a kernel panic.

When the call to register_netdevice() (made from ipvlan_link_new()) fails,
inside that function we call ipvlan_uninit() (through ndo_uninit()) to
destroy the ipvlan port. Upon returning unsuccessfully from
register_netdevice(), we go ahead and call ipvlan_port_destroy() again,
which causes a NULL pointer dereference panic.

To test this theory, I loaded up netdev-notifier-error-inject.ko and did 

$ sudo echo -22 > /sys/kernel/debug/notifier-error-inject/\
  netdev/actions/NETDEV_POST_INIT/error
$ sudo  ip li add ipvl0 link enp7s0 type ipvlan
...system panics...
BUG: unable to handle kernel NULL pointer dereference at 0820
IP: ipvlan_port_destroy+0x2a/0xf0 [ipvlan]

Similar issue exists in macvlan_port_destroy() and it will be addressed
by a separate patch. The following patch fixes the ipvlan case. I tested
my changes for regression by running LTP's ipvlan test case.

Girish Moodalbail (1):
  ipvlan: NULL pointer dereference panic in ipvlan_port_destroy

 drivers/net/ipvlan/ipvlan_main.c | 104 +--
 1 file changed, 55 insertions(+), 49 deletions(-)

-- 
1.8.3.1

[PATCH net v2 1/1] ipvlan: NULL pointer dereference panic in ipvlan_port_destroy

2017-11-16 Thread Girish Moodalbail

When the call to register_netdevice() (made from ipvlan_link_new()) fails,
we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan
port. After returning unsuccessfully from register_netdevice(), we go
ahead and call ipvlan_port_destroy() again, which causes a NULL pointer
dereference panic. Fix the issue by making the ipvlan_init() and
ipvlan_uninit() calls symmetric.

The ipvlan port will now be created inside ipvlan_init() and will be
destroyed in ipvlan_uninit().

Fixes: 2ad7bf363841 (ipvlan: Initial check-in of the IPVLAN driver)
Signed-off-by: Girish Moodalbail 
---
v1 -> v2:
  - took care of David Miller's comment on ipvlan_init() and
ipvlan_uninit() not being symmetric.
---
---
 drivers/net/ipvlan/ipvlan_main.c | 104 +--
 1 file changed, 55 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index a266aa4..30cb803 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -107,16 +107,6 @@ static int ipvlan_port_create(struct net_device *dev)
struct ipvl_port *port;
int err, idx;
 
-   if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) {
-   netdev_err(dev, "Master is either lo or non-ether device\n");
-   return -EINVAL;
-   }
-
-   if (netdev_is_rx_handler_busy(dev)) {
-   netdev_err(dev, "Device is already in use.\n");
-   return -EBUSY;
-   }
-
port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL);
if (!port)
return -ENOMEM;
@@ -179,8 +169,9 @@ static void ipvlan_port_destroy(struct net_device *dev)
 static int ipvlan_init(struct net_device *dev)
 {
struct ipvl_dev *ipvlan = netdev_priv(dev);
-   const struct net_device *phy_dev = ipvlan->phy_dev;
-   struct ipvl_port *port = ipvlan->port;
+   struct net_device *phy_dev = ipvlan->phy_dev;
+   struct ipvl_port *port;
+   int err;
 
dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
 (phy_dev->state & IPVLAN_STATE_MASK);
@@ -196,18 +187,27 @@ static int ipvlan_init(struct net_device *dev)
if (!ipvlan->pcpu_stats)
return -ENOMEM;
 
+   if (!netif_is_ipvlan_port(phy_dev)) {
+   err = ipvlan_port_create(phy_dev);
+   if (err < 0) {
+   free_percpu(ipvlan->pcpu_stats);
+   return err;
+   }
+   }
+   port = ipvlan_port_get_rtnl(phy_dev);
port->count += 1;
-
return 0;
 }
 
 static void ipvlan_uninit(struct net_device *dev)
 {
struct ipvl_dev *ipvlan = netdev_priv(dev);
-   struct ipvl_port *port = ipvlan->port;
+   struct net_device *phy_dev = ipvlan->phy_dev;
+   struct ipvl_port *port;
 
free_percpu(ipvlan->pcpu_stats);
 
+   port = ipvlan_port_get_rtnl(phy_dev);
port->count -= 1;
if (!port->count)
ipvlan_port_destroy(port->dev);
@@ -554,7 +554,6 @@ int ipvlan_link_new(struct net *src_net, struct net_device 
*dev,
struct net_device *phy_dev;
int err;
u16 mode = IPVLAN_MODE_L3;
-   bool create = false;
 
if (!tb[IFLA_LINK])
return -EINVAL;
@@ -568,28 +567,41 @@ int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
 
phy_dev = tmp->phy_dev;
} else if (!netif_is_ipvlan_port(phy_dev)) {
-   err = ipvlan_port_create(phy_dev);
-   if (err < 0)
-   return err;
-   create = true;
-   }
+   /* Exit early if the underlying link is invalid or busy */
+   if (phy_dev->type != ARPHRD_ETHER ||
+   phy_dev->flags & IFF_LOOPBACK) {
+   netdev_err(phy_dev,
+  "Master is either lo or non-ether device\n");
+   return -EINVAL;
+   }
 
-   if (data && data[IFLA_IPVLAN_MODE])
-   mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+   if (netdev_is_rx_handler_busy(phy_dev)) {
+   netdev_err(phy_dev, "Device is already in use.\n");
+   return -EBUSY;
+   }
+   }
 
-   port = ipvlan_port_get_rtnl(phy_dev);
ipvlan->phy_dev = phy_dev;
ipvlan->dev = dev;
-   ipvlan->port = port;
ipvlan->sfeatures = IPVLAN_FEATURES;
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(>addrs);
 
-   /* Flags are per port and latest update overrides. User has
-* to be consistent in setting it just like the mode attribute.
+   /* TODO Probably put random address here to be presented to the
+* world but keep using the physical-dev address for the outgoing
+* packets.
 */
-   if (data && data[IFLA_IPVLAN_FLAGS])
-   ipvlan->port->flags =

[PATCH net] route: also update fnhe_genid when updating a route cache

2017-11-16 Thread Xin Long

Now, after 'ip route flush cache', every fnhe ends up with
fnhe_genid != genid. If a redirect/pmtu icmp packet then arrives, the old
fnhe is found and all of its members except fnhe_genid are updated.

The next time a route lookup tries to rebind this fnhe to the new dst,
the fnhe is flushed because fnhe_genid != genid. As a result, this
redirect/pmtu icmp packet is actually never applied.

This patch is to also reset fnhe_genid when updating a route cache.

Fixes: 5aad1de5ea2c ("ipv4: use separate genid for next hop exceptions")
Acked-by: Hannes Frederic Sowa 
Signed-off-by: Xin Long 
---
 net/ipv4/route.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11cf2fe..43b69af 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, 
__be32 daddr, __be32 gw,
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
struct rtable *rt;
+   u32 genid, hval;
unsigned int i;
int depth;
-   u32 hval = fnhe_hashfun(daddr);
+
+   genid = fnhe_genid(dev_net(nh->nh_dev));
+   hval = fnhe_hashfun(daddr);
 
spin_lock_bh(_lock);
 
@@ -676,6 +679,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 
daddr, __be32 gw,
}
 
if (fnhe) {
+   if (fnhe->fnhe_genid != genid)
+   fnhe->fnhe_genid = genid;
if (gw)
fnhe->fnhe_gw = gw;
if (pmtu)
@@ -699,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 
daddr, __be32 gw,
fnhe->fnhe_next = hash->chain;
rcu_assign_pointer(hash->chain, fnhe);
}
-   fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+   fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
-- 
2.1.0

[PATCH net] route: update fnhe_expires for redirect when the fnhe exists

2017-11-16 Thread Xin Long

Now when creating a fnhe for a redirect, fnhe_expires is set for this
new route cache entry. But when updating an existing one, it is not.
This causes the fnhe to never expire.

Paolo already noticed it before, in Jianlin's test case, it became
even worse:

When 'ip route flush cache' is run, the old fnhe is not removed; only
its members are cleared. When a redirect comes again, this fnhe will
be found and updated, but will never expire because fnhe_expires is
not set.

So fix it by simply updating fnhe_expires even when it's for a redirect.

Fixes: aee06da6726d ("ipv4: use seqlock for nh_exceptions")
Reported-by: Jianlin Shi 
Acked-by: Hannes Frederic Sowa 
Signed-off-by: Xin Long 
---
 net/ipv4/route.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3b42775..11cf2fe 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -678,10 +678,9 @@ static void update_or_create_fnhe(struct fib_nh *nh, 
__be32 daddr, __be32 gw,
if (fnhe) {
if (gw)
fnhe->fnhe_gw = gw;
-   if (pmtu) {
+   if (pmtu)
fnhe->fnhe_pmtu = pmtu;
-   fnhe->fnhe_expires = max(1UL, expires);
-   }
+   fnhe->fnhe_expires = max(1UL, expires);
/* Update all cached dsts too */
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt)
-- 
2.1.0

[PATCH net] sctp: report SCTP_ERROR_INV_STRM as cpu endian

2017-11-16 Thread Xin Long

rfc6458 demands the send_error in SCTP_SEND_FAILED_EVENT should
be in cpu endian, while SCTP_ERROR_INV_STRM is in big endian.

This issue has been there since the very beginning; Eric noticed it by
running 'make C=2 M=net/sctp/'.

This patch is to convert it before reporting it.

Reported-by: Eric Dumazet 
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index a11db21..f86ceee 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -64,7 +64,7 @@ static void sctp_stream_outq_migrate(struct sctp_stream 
*stream,
 */
 
/* Mark as failed send. */
-   sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM);
+   sctp_chunk_fail(ch, be16_to_cpu(SCTP_ERROR_INV_STRM));
if (asoc->peer.prsctp_capable &&
SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags))
asoc->sent_cnt_removable--;
-- 
2.1.0

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Nikolay Aleksandrov

On 17/11/17 07:26, Willy Tarreau wrote:
> Hi Stephen,
> 
> On Thu, Nov 16, 2017 at 04:27:18PM -0800, Stephen Hemminger wrote:
>> On Thu, 16 Nov 2017 21:21:55 +0100
>> Vincent Bernat  wrote:
>>
>>>  ? 16 novembre 2017 20:23 +0100, Andrew Lunn  :
>>>
>>>> struct net_bridge_fdb_entry is 40 bytes.
>>>>
>>>> My WiFi access point which is also a 5 port bridge, currently has 97MB
>>>> free RAM. That is space for about 2.5M FDB entries. So even Roopa's
>>>> 128K is not really a problem, in terms of memory.
>>>
>>> I am also interested in Sarah's patch because we can now have bridge
>>> with many ports through VXLAN. The FDB can be replicated to an external
>>> daemon with BGP and the cost of each additional MAC address is therefore
>>> higher than just a few bytes. It seems simpler to implement a limiting
>>> policy early (at the port or bridge level).
>>>
>>> Also, this is a pretty standard limit to have for a bridge (switchport
>>> port-security maximum on Cisco, set interface X mac-limit on
>>> Juniper). And it's not something easy to do with ebtables.
>>
>> I want an optional limit per port, it makes a lot of sense.
>> If for no other reason that huge hash tables are a performance problems.
> 
> Except it's not a limit in that it doesn't prevent new traffic from going
> in, it only prevents new MACs from being learned, so suddenly you start
> flooding all ports with new traffic once the limit is reached, which is
> not trivial to detect nor diagnose.
> 
>> There is a bigger question about which fdb to evict but just dropping the
>> new one seems to be easiest and as good as any other solution.
> 
> Usually it's better to apply LRU or random here in my opinion, as the
> new entry is much more likely to be needed than older ones by definition.
> In terms of CPU usage it may even be better to kill an entire series in
> the hash table (eg: all nodes in the same table entry for example), as
> the operation can be almost as cheap and result in not being needed for
> a while again.
> 
> Willy
> 

Hi,
I have been thinking about this and how to try and keep everyone happy
while maintaining performance, so how about this:

 * Add a per-port fdb LRU list which is used only when there are >= 2000
   global fdb entries _or_ a per-port limit is set. If the list is in use,
   update the fdb list position once per second. I think these properties will
   allow us to scale and have a better LRU locking granularity (per-port), and
   in smaller setups (not needing LRU) the cost will be a single test in the
   fast path.

 * Use the above LRU list for per-port limit evictions

 * More importantly use the LRU list for fdb expire, removing the need to walk
   over all fdbs every time we expire entries. This would be of great help for
   larger setups with many fdbs (it will kick in on >= 2000 fdb entries).

 * (optional) Make the global LRU kick in limit an option, people might want to
   minimize traffic blocking due to expire process.

Any comments and suggestions are welcome. When we agree on the details I'll do
the RFC patches and run some tests before submitting. Defaults will be kept as
they are now. I've chosen the 2000 limit arbitrarily and am happy to change it
if people have something else in mind. This should play nice with the resizeable
hashtable change.

Thanks,
 Nik

[PATCH net] sctp: set frag_point in sctp_setsockopt_maxseg correctly

2017-11-16 Thread Xin Long

Now in sctp_setsockopt_maxseg user_frag or frag_point can be set with
val >= 8 and val <= SCTP_MAX_CHUNK_LEN. But both checks are incorrect.

val >= 8 means frag_point can even be less than SCTP_DEFAULT_MINSEGMENT.
Then in sctp_datamsg_from_user(), when its value is greater than the cookie
echo len and it tries to bundle with the cookie echo chunk, first_len will
overflow.

The worst case is when its value is equal to the cookie echo len: first_len
becomes 0, and it will go into a dead loop for fragmenting later on. In
Hangbin's syzkaller testing env, oom was even triggered due to consecutive
memory allocation in that loop.

Besides, SCTP_MAX_CHUNK_LEN is the max size of the whole chunk, it should
deduct the data header for frag_point or user_frag check.

This patch does a proper check with SCTP_DEFAULT_MINSEGMENT subtracting
the sctphdr and datahdr, SCTP_MAX_CHUNK_LEN subtracting datahdr when
setting frag_point via sockopt. It also improves sctp_setsockopt_maxseg
codes.

Suggested-by: Marcelo Ricardo Leitner 
Reported-by: Hangbin Liu 
Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h |  3 ++-
 net/sctp/socket.c   | 29 +++--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d7d8cba..749a428 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -444,7 +444,8 @@ static inline int sctp_frag_point(const struct 
sctp_association *asoc, int pmtu)
if (asoc->user_frag)
frag = min_t(int, frag, asoc->user_frag);
 
-   frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN));
+   frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN -
+   sizeof(struct sctp_data_chunk)));
 
return frag;
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 4c0a772..3204a9b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3140,9 +3140,9 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char 
__user *optval, unsign
  */
 static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, 
unsigned int optlen)
 {
+   struct sctp_sock *sp = sctp_sk(sk);
struct sctp_assoc_value params;
struct sctp_association *asoc;
-   struct sctp_sock *sp = sctp_sk(sk);
int val;
 
if (optlen == sizeof(int)) {
@@ -3158,26 +3158,35 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char 
__user *optval, unsigned
if (copy_from_user(, optval, optlen))
return -EFAULT;
val = params.assoc_value;
-   } else
+   } else {
return -EINVAL;
+   }
 
-   if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN)))
-   return -EINVAL;
+   if (val) {
+   int min_len, max_len;
 
-   asoc = sctp_id2assoc(sk, params.assoc_id);
-   if (!asoc && params.assoc_id && sctp_style(sk, UDP))
-   return -EINVAL;
+   min_len = SCTP_DEFAULT_MINSEGMENT - sp->pf->af->net_header_len;
+   min_len -= sizeof(struct sctphdr) +
+  sizeof(struct sctp_data_chunk);
+
+   max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
 
+   if (val < min_len || val > max_len)
+   return -EINVAL;
+   }
+
+   asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
if (val == 0) {
-   val = asoc->pathmtu;
-   val -= sp->pf->af->net_header_len;
+   val = asoc->pathmtu - sp->pf->af->net_header_len;
val -= sizeof(struct sctphdr) +
-   sizeof(struct sctp_data_chunk);
+  sizeof(struct sctp_data_chunk);
}
asoc->user_frag = val;
asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
} else {
+   if (params.assoc_id && sctp_style(sk, UDP))
+   return -EINVAL;
sp->user_frag = val;
}
 
-- 
2.1.0

Re: [PATCH net-next v3] net: assign err to 0 at begin in do_setlink() function

2017-11-16 Thread David Miller

From: yuan linyu 
Date: Thu, 16 Nov 2017 19:59:48 +0800

> From: yuan linyu 
> 
> each netlink attribute have proper process when error happen,
> when exit one attribute process, it implies that no error,
> so err = 0; is useless.
> 
> assign err = 0; at beginning if all attributes not set.
> 
> v1 -> v2:
>   fix review comment from David, clear err before
>   nla_for_each_nested()
> 
> v2 -> v3:
>   maybe wrong understanding of David comment,
>   provide a new version
> 
> Signed-off-by: yuan linyu 

I'm sorry I still find it hard to accept this change.

What about all of the assignments of 'err' which only branch to
'errout' if err is negative?  It is not easy to see that none of those
cases ever results in 'err' holding a positive non-zero value.

The code as-is is the easiest to understand, audit and prove correct
in the error-free case.  And this is because of the explicit clearing of
'err' to zero late in the function.

Thank you.

Re: [PATCH] sfp: Add support for DWDM SFP modules

2017-11-16 Thread David Miller

From: Jan Kundrát 
Date: Wed, 15 Nov 2017 12:39:33 +0100

> Without this patch, but with CONFIG_SFP enabled, my NIC won't detect
> module unplugging, which is suboptimal.
> 
> I'm using an OEM "Cisco compatible" DWDM fixed-frequency 100Ghz-grid SFP
> module. It reports itself as a 0x0b 0x24. According to SFF-8024, byte 0
> value "0Bh" refers to a "DWDM-SFP/SFP+ (not using SFF-8472)". In
> practice, there's a lot of shared properties here.
> 
> Everything is apparently defined in a document called "DWDM SFP MSA
> (Multi-source Agreement), Revision 1.0, 19th September 2005". I don't
> have access to that document (yet). Its likely source, the
> http://www.dwdmsfpmsa.org/ has been down for years.
> 
> From the datasheets that I was able to find on random vendors' web, the
> second byte can vary -- 0x27 is used, too.
> 
> Tested on Clearfog Base with v4.14 and Russell King's SFP patches.
> 
> Signed-off-by: Jan Kundrát 

Russell, Florian, Andrew, can I get a review?

Re: [PATCH] macvlan: verify MTU before lowerdev xmit

2017-11-16 Thread David Miller

From: Daniel Axtens 
Date: Tue, 14 Nov 2017 21:32:51 +1100

> If a macvlan device which is not in bridge mode receives a packet,
> it is sent straight to the lowerdev without checking against the
> device's MTU. This also happens for multicast traffic.
> 
> Add an is_skb_forwardable() check against the lowerdev before
> sending the packet out through it. I think this is the simplest
> and best way to do it, and is consistent with the use of
> dev_forward_skb() in the bridge path.
> 
> This is easy to replicate:
>  - create a VM with a macvtap connection in private mode
>  - set the lowerdev MTU to something low in the host (e.g. 1480)
>  - do not set the MTU lower in the guest (e.g. keep at 1500)
>  - netperf to a different host with the same high MTU
>  - observe that currently, the driver will forward too-big packets
>  - observe that with this patch the packets are dropped
> 
> Cc: Shannon Nelson 
> Signed-off-by: Daniel Axtens 

This is an area where we really haven't set down some clear rules
for behavior.

If an interface has a particular MTU, it must be able to successfully
send MTU sized packets on that link be it virtual or physical.

Only a "next hop" can have a different MTU and thus drop packets.
This requirement is absolutely necessary in order for proper
signalling (path MTU messages) to make their way back to the sending
host.

In this VM-->macvlan case it's more like a point to point connection
and there lacks a "next hop" to serve as the provider of proper
signalling.

This whole situation seems to be handled quite poorly in virtualized
setups.  Allowing one end of the virtual networking "link" into the
guest to have a different MTU from the other end is a HUGE mistake.

There needs to be control path signalling between the guest and the
provider of the virtual link so that they can synchronize their MTU
settings.

Yes this is hard, but what is happening now doesn't fly in the long
term.

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Willy Tarreau

Hi Stephen,

On Thu, Nov 16, 2017 at 04:27:18PM -0800, Stephen Hemminger wrote:
> On Thu, 16 Nov 2017 21:21:55 +0100
> Vincent Bernat  wrote:
> 
> >  ? 16 novembre 2017 20:23 +0100, Andrew Lunn  :
> > 
> > > struct net_bridge_fdb_entry is 40 bytes.
> > >
> > > My WiFi access point which is also a 5 port bridge, currently has 97MB
> > > free RAM. That is space for about 2.5M FDB entries. So even Roopa's
> > > 128K is not really a problem, in terms of memory.  
> > 
> > I am also interested in Sarah's patch because we can now have bridge
> > with many ports through VXLAN. The FDB can be replicated to an external
> > daemon with BGP and the cost of each additional MAC address is therefore
> > higher than just a few bytes. It seems simpler to implement a limiting
> > policy early (at the port or bridge level).
> > 
> > Also, this is a pretty standard limit to have for a bridge (switchport
> > port-security maximum on Cisco, set interface X mac-limit on
> > Juniper). And it's not something easy to do with ebtables.
> 
> I want an optional limit per port, it makes a lot of sense.
> If for no other reason that huge hash tables are a performance problems.

Except it's not a limit in that it doesn't prevent new traffic from going
in, it only prevents new MACs from being learned, so suddenly you start
flooding all ports with new traffic once the limit is reached, which is
not trivial to detect nor diagnose.

> There is a bigger question about which fdb to evict but just dropping the
> new one seems to be easiest and as good as any other solution.

Usually it's better to apply LRU or random here in my opinion, as the
new entry is much more likely to be needed than older ones by definition.
In terms of CPU usage it may even be better to kill an entire series in
the hash table (eg: all nodes in the same table entry for example), as
the operation can be almost as cheap and result in not being needed for
a while again.

Willy

Re: [PATCH net 0/5] nfp: flower fixes and typo in ethtool stats name

2017-11-16 Thread David Miller

From: Jakub Kicinski 
Date: Thu, 16 Nov 2017 17:06:38 -0800

> This set comes from the flower offload team.

Series applied, thank you.

Re: [PATCH] virto_net: remove empty file 'virtio_net.'

2017-11-16 Thread David Miller

From: Jason Wang 
Date: Fri, 17 Nov 2017 10:55:44 +0800

> 
> 
> On 2017年11月17日 10:46, Joel Stanley wrote:
>> Looks like this was mistakenly added to the tree as part of
>> commit 186b3c998c50 ("virtio-net: support XDP_REDIRECT").
>>
>> Signed-off-by: Joel Stanley 
>> ---
>>   drivers/net/virtio_net. | 0
>>   1 file changed, 0 insertions(+), 0 deletions(-)
>>   delete mode 100644 drivers/net/virtio_net.
>>
>> diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net.
>> deleted file mode 100644
>> index e69de29bb2d1..
> 
> My bad, don't know what happens at that time.
> 
> This is for -net.
> 
> Acked-by: Jason Wang 

Applied, thanks everyone.

Re: [PATCH v1 net-next 0/7] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers

2017-11-16 Thread David Miller


Hello, please resubmit all of this KSZ switch work when the net-next
tree opens up again as it is currently closed.

Thank you.

Re: [PATCH v2 net-next] net: dsa: microchip: Add MIB counter reading support

2017-11-16 Thread Andrew Lunn

On Thu, Nov 16, 2017 at 06:42:50PM -0800, tristram...@microchip.com wrote:
> From: Tristram Ha 
> 
> Add MIB counter reading support.

Hi Tristram

Some more details would be good here.

> +static void ksz9477_r_mib_pkt(struct ksz_device *dev, int port, u16 addr,
> +   u64 *dropped, u64 *cnt)
> +{
> + addr = ksz9477_mib_names[addr].index;
> + ksz9477_r_mib_cnt(dev, port, addr, cnt);
> +}

dropped is unused here. Which seems to make dropped in general unused.

> +static void ksz_mib_read_work(struct work_struct *work)
> +{
> + struct ksz_device *dev =
> + container_of(work, struct ksz_device, mib_read);
> + struct ksz_port *p;
> + struct ksz_port_mib *mib;
> + int i;
> +
> + for (i = 0; i < dev->mib_port_cnt; i++) {
> + p = >ports[i];
> + if (!p->on)
> + continue;
> + mib = >mib;
> + mutex_lock(>cnt_mutex);
> +
> + /* read only dropped counters when link is not up */
> + if (p->link_down)
> + p->link_down = 0;

It is not obvious to me what this is doing.

> + else if (!p->link_up)
> + mib->cnt_ptr = dev->reg_mib_cnt;
> + port_r_cnt(dev, i);
> + mutex_unlock(>cnt_mutex);
> + }
> +}
> +
> +void ksz_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *buf)
> +{
> + struct ksz_device *dev = ds->priv;
> + struct ksz_port_mib *mib;
> +
> + mib = >ports[port].mib;
> +
> + /* freeze MIB counters if supported */
> + if (dev->dev_ops->freeze_mib)
> + dev->dev_ops->freeze_mib(dev, port, true);
> + mutex_lock(>cnt_mutex);
> + port_r_cnt(dev, port);
> + mutex_unlock(>cnt_mutex);
> + if (dev->dev_ops->freeze_mib)
> + dev->dev_ops->freeze_mib(dev, port, false);
> + memcpy(buf, mib->counters, dev->mib_cnt * sizeof(u64));

Should the memcpy be made while holding the mutex? As soon as you
release it, the timer code can start updating the values in
mib->counters. That then makes the freeze pointless.

>  void ksz_switch_remove(struct ksz_device *dev)
>  {
> + /* timer started */
> + if (dev->mib_read_timer.expires) {
> + del_timer_sync(>mib_read_timer);
> + flush_work(>mib_read);
> + }

Is this race free?

   Andrew

Re: [PATCH] virto_net: remove empty file 'virtio_net.'

2017-11-16 Thread Michael S. Tsirkin

On Fri, Nov 17, 2017 at 01:16:36PM +1030, Joel Stanley wrote:
> Looks like this was mistakenly added to the tree as part of
> commit 186b3c998c50 ("virtio-net: support XDP_REDIRECT").
> 
> Signed-off-by: Joel Stanley 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/net/virtio_net. | 0
>  1 file changed, 0 insertions(+), 0 deletions(-)
>  delete mode 100644 drivers/net/virtio_net.
> 
> diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net.
> deleted file mode 100644
> index e69de29bb2d1..
> -- 
> 2.14.1

Re: [PATCH net-next] dt-bindings: net: dsa: Document additional Microchip KSZ family switches

2017-11-16 Thread Andrew Lunn

> + ksz9477: ksz9477@0 {
> + compatible = "microchip,ksz9477";
> + reg = <0>;
>  
> - 
> spi-max-frequency = <4400>;
> - spi-cpha;
> - spi-cpol;
> + spi-max-frequency = <4400>;
> + spi-cpha;
> + spi-cpol;

...

> +
> + };
> + ksz8794: ksz8794@0 {
> + compatible = "microchip,ksz8794";
> + reg = <0>;
> +
> + spi-max-frequency = <3000>;
> + spi-cpha;
> + spi-cpol;
> +
> + };

...

> + };
> + ksz8864: ksz8864@0 {
> + compatible = "microchip,ksz8864";
> + reg = <0>;
> +
> + spi-max-frequency = <1200>;
> + spi-cpha;
> + spi-cpol;
> +
...
> + };
> + };

Hi Tristram

Nitpick. These all appear to be on the same SPI bus. So they should
have different reg values.

Also, the label is supposed to be generic, not specific. So switch0:
ksz9477@0, switch1: ksz8794@1, etc.

   Andrew

Re: [PATCH v1 net-next 1/7] net: dsa: microchip: Replace license with GPL

2017-11-16 Thread Andrew Lunn

On Thu, Nov 16, 2017 at 06:41:25PM -0800, tristram...@microchip.com wrote:
> From: Tristram Ha 
> 
> Replace license with GPL.
> 
> Signed-off-by: Tristram Ha 
> Reviewed-by: Woojung Huh 

Reviewed-by: Andrew Lunn 

Andrew

Re: [PATCH v1 net-next 7/7] net: dsa: microchip: Rename ksz_9477_reg.h to ksz9477_reg.h

2017-11-16 Thread Andrew Lunn

On Thu, Nov 16, 2017 at 06:41:31PM -0800, tristram...@microchip.com wrote:
> From: Tristram Ha 
> 
> Rename ksz_9477_reg.h to ksz9477_reg.h for consistency as the product
> name is always KSZ.
> 
> Signed-off-by: Tristram Ha 
> Reviewed-by: Woojung Huh 

Reviewed-by: Andrew Lunn 

Andrew

Re: [PATCH v1 net-next 0/7] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers

2017-11-16 Thread Andrew Lunn

On Thu, Nov 16, 2017 at 06:41:24PM -0800, tristram...@microchip.com wrote:
> From: Tristram Ha 
> 
> This series of patches is to modify the original KSZ9477 DSA driver so
> that other KSZ switch drivers can be added and use the common code.

Hi Tristram

http://vger.kernel.org/~davem/net-next.html

It is better to send an RFC patchset while netdev is closed and not
send it to David. He will shout at you otherwise.

   Andrew

Re: [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API

2017-11-16 Thread chetan L

On Thu, Nov 16, 2017 at 8:53 AM, Jesper Dangaard Brouer
 wrote:
> On Wed, 15 Nov 2017 14:21:38 -0800
> chet l  wrote:
>
>> One quick question:
>> Any thoughts on SVM support?
>
> What is SVM ?
>

Shared Virtual Memory(PCIe based). So going back to one of your
mapping examples. The protocol can be AF_CHANNEL.
Modes could be:
AF_ZC , AF_XDP_REDIRECT

Mapping types could be:
AF_NON_SVM(current setup - no PASID needed), AF_SVM(onus is on the
user to pass the PASID as part of the setsockopt), AF_SVM++

Chetan

Re: [PATCH] virto_net: remove empty file 'virtio_net.'

2017-11-16 Thread Jason Wang




On 2017年11月17日 10:46, Joel Stanley wrote:

Looks like this was mistakenly added to the tree as part of
commit 186b3c998c50 ("virtio-net: support XDP_REDIRECT").

Signed-off-by: Joel Stanley 
---
  drivers/net/virtio_net. | 0
  1 file changed, 0 insertions(+), 0 deletions(-)
  delete mode 100644 drivers/net/virtio_net.

diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net.
deleted file mode 100644
index e69de29bb2d1..


My bad, don't know what happens at that time.

This is for -net.

Acked-by: Jason Wang 

Thanks

[PATCH] virto_net: remove empty file 'virtio_net.'

2017-11-16 Thread Joel Stanley

Looks like this was mistakenly added to the tree as part of
commit 186b3c998c50 ("virtio-net: support XDP_REDIRECT").

Signed-off-by: Joel Stanley 
---
 drivers/net/virtio_net. | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 drivers/net/virtio_net.

diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net.
deleted file mode 100644
index e69de29bb2d1..
-- 
2.14.1

[PATCH v2 net-next 0/1] net: dsa: microchip: Add Microchip KSZ8795 DSA driver

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

This patch requires the previous patches for Microchip KSZ9477 DSA driver.

v2
- No new feature is introduced in tag_ksz.c

v1
- Return error codes instead of numbers
- Add more comments to clarify operation
- Use ksz8795 prefix to indicate KSZ8795 specific code
- Simplify MIB counter reading code
- Switch driver code is not accessed from tag_ksz.c

Tristram Ha (1):
  Add Microchip KSZ8795 DSA driver.

 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8795.c | 1365 +++
 drivers/net/dsa/microchip/ksz8795_reg.h | 1016 +++
 drivers/net/dsa/microchip/ksz8795_spi.c |  166 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 include/net/dsa.h   |1 +
 net/dsa/Kconfig |4 +
 net/dsa/dsa.c   |3 +
 net/dsa/dsa_priv.h  |1 +
 net/dsa/tag_ksz.c   |   32 +
 11 files changed, 2608 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

-- 
1.9.1

[PATCH v2 net-next] net: dsa: Modify tag_ksz.c so that tail tag code can be used by other KSZ switch drivers

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Modify tag_ksz.c so that tail tag code can be used by other KSZ switch
drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
v2
- No new feature is introduced

v1
- Switch driver code is not accessed from tag_ksz.c

 drivers/net/dsa/microchip/Kconfig   |  2 +-
 drivers/net/dsa/microchip/ksz9477.c |  2 +-
 include/net/dsa.h   |  2 +-
 net/dsa/Kconfig |  4 ++
 net/dsa/dsa.c   |  4 +-
 net/dsa/dsa_priv.h  |  2 +-
 net/dsa/tag_ksz.c   | 90 -
 7 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index 5a8660d..ab8f9f6 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -1,7 +1,7 @@
 menuconfig MICROCHIP_KSZ9477
tristate "Microchip KSZ9477 series switch support"
depends on NET_DSA
-   select NET_DSA_TAG_KSZ
+   select NET_DSA_TAG_KSZ9477
help
  This driver adds support for Microchip KSZ9477 switch chips.
 
diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 4e998a4..c735dca 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -344,7 +344,7 @@ static void ksz9477_port_init_cnt(struct ksz_device *dev, 
int port)
 static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
  int port)
 {
-   return DSA_TAG_PROTO_KSZ;
+   return DSA_TAG_PROTO_KSZ9477;
 }
 
 static int ksz9477_phy_read16(struct dsa_switch *ds, int addr, int reg)
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2a05738..3d23c0b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -32,7 +32,7 @@ enum dsa_tag_protocol {
DSA_TAG_PROTO_BRCM_PREPEND,
DSA_TAG_PROTO_DSA,
DSA_TAG_PROTO_EDSA,
-   DSA_TAG_PROTO_KSZ,
+   DSA_TAG_PROTO_KSZ9477,
DSA_TAG_PROTO_LAN9303,
DSA_TAG_PROTO_MTK,
DSA_TAG_PROTO_QCA,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 03c3bdf..809b0e2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -32,6 +32,10 @@ config NET_DSA_TAG_EDSA
 config NET_DSA_TAG_KSZ
bool
 
+config NET_DSA_TAG_KSZ9477
+   bool
+   select NET_DSA_TAG_KSZ
+
 config NET_DSA_TAG_LAN9303
bool
 
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6a9d0f5..92056a7 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -53,8 +53,8 @@ static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff 
*skb,
 #ifdef CONFIG_NET_DSA_TAG_EDSA
[DSA_TAG_PROTO_EDSA] = _netdev_ops,
 #endif
-#ifdef CONFIG_NET_DSA_TAG_KSZ
-   [DSA_TAG_PROTO_KSZ] = _netdev_ops,
+#ifdef CONFIG_NET_DSA_TAG_KSZ9477
+   [DSA_TAG_PROTO_KSZ9477] = _netdev_ops,
 #endif
 #ifdef CONFIG_NET_DSA_TAG_LAN9303
[DSA_TAG_PROTO_LAN9303] = _netdev_ops,
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7d03669..a2955a8 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -200,7 +200,7 @@ static inline struct dsa_port *dsa_slave_to_port(const 
struct net_device *dev)
 extern const struct dsa_device_ops edsa_netdev_ops;
 
 /* tag_ksz.c */
-extern const struct dsa_device_ops ksz_netdev_ops;
+extern const struct dsa_device_ops ksz9477_netdev_ops;
 
 /* tag_lan9303.c */
 extern const struct dsa_device_ops lan9303_netdev_ops;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 0f62eff..7343270 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -14,34 +14,21 @@
 #include 
 #include "dsa_priv.h"
 
-/* For Ingress (Host -> KSZ), 2 bytes are added before FCS.
- * ---
- * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
- * ---
- * tag0 : Prioritization (not used now)
- * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
- *
- * For Egress (KSZ -> Host), 1 byte is added before FCS.
- * ---
- * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|FCS(4bytes)
- * ---
- * tag0 : zero-based value represents port
- *   (eg, 0x00=port1, 0x02=port3, 0x06=port7)
- */
-
-#defineKSZ_INGRESS_TAG_LEN 2
-#defineKSZ_EGRESS_TAG_LEN  1
+/* Usually only one byte is used for tail tag. */
+#define KSZ_INGRESS_TAG_LEN1
+#define KSZ_EGRESS_TAG_LEN 1
 
-static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev,
+   int len,
+   void

[PATCH net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Add Microchip KSZ8895 DSA driver.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8895.c | 1276 +++
 drivers/net/dsa/microchip/ksz8895_reg.h |  824 
 drivers/net/dsa/microchip/ksz8895_spi.c |  157 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 6 files changed, 2277 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8895.c
 create mode 100644 drivers/net/dsa/microchip/ksz8895_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8895_spi.c

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index cb95d3d..b854c4b 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -27,3 +27,20 @@ config MICROCHIP_KSZ8795_SPI_DRIVER
 
  It is required to use the KSZ8795 switch driver as the only access
  is through SPI.
+
+menuconfig MICROCHIP_KSZ8895
+   tristate "Microchip KSZ8895 series switch support"
+   depends on NET_DSA
+   select NET_DSA_TAG_KSZ8795
+   help
+ This driver adds support for Microchip KSZ8895 switch chips.
+
+config MICROCHIP_KSZ8895_SPI_DRIVER
+   tristate "KSZ8895 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ8895 && SPI
+   default y
+   help
+ This driver accesses KSZ8895 chip through SPI.
+
+ It is required to use the KSZ8895 switch driver as the only access
+ is through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 99a283e..8dd6312 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_MICROCHIP_KSZ9477) += ksz9477.o 
ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
 obj-$(CONFIG_MICROCHIP_KSZ8795)+= ksz8795.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ8795_SPI_DRIVER) += ksz8795_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ8895)+= ksz8895.o ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ8895_SPI_DRIVER) += ksz8895_spi.o
diff --git a/drivers/net/dsa/microchip/ksz8895.c 
b/drivers/net/dsa/microchip/ksz8895.c
new file mode 100644
index 000..e04643c
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8895.c
@@ -0,0 +1,1276 @@
+/*
+ * Microchip KSZ8895 switch driver
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ * Tristram Ha 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz8895_reg.h"
+
+static const struct {
+   char string[ETH_GSTRING_LEN];
+} ksz8895_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { "rx" },
+   { "rx_hi" },
+   { "rx_undersize" },
+   { "rx_fragments" },
+   { "rx_oversize" },
+   { "rx_jabbers" },
+   { "rx_symbol_err" },
+   { "rx_crc_err" },
+   { "rx_align_err" },
+   { "rx_mac_ctrl" },
+   { "rx_pause" },
+   { "rx_bcast" },
+   { "rx_mcast" },
+   { "rx_ucast" },
+   { "rx_64_or_less" },
+   { "rx_65_127" },
+   { "rx_128_255" },
+   { "rx_256_511" },
+   { "rx_512_1023" },
+   { "rx_1024_1522" },
+   { "tx" },
+   { "tx_hi" },
+   { "tx_late_col" },
+   { "tx_pause" },
+   { "tx_bcast" },
+   { "tx_mcast" },
+   { "tx_ucast" },
+   { "tx_deferred" },
+   { "tx_total_col" },
+   { "tx_exc_col" },
+   { "tx_single_col" },
+   { "tx_mult_col" },
+   { "rx_discards" },
+   { "tx_discards" },
+};
+
+static int ksz8895_reset_switch(struct ksz_device *dev)
+{
+   /* reset switch */
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1,
+  SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+
+   return 0;
+}
+
+static void ksz8895_set_prio_queue(struct ksz_device *dev, int port, int queue)
+{
+   u8 hi;
+   u8 lo;
+
+   /* Number of queues can only be 1, 2, or 4. */
+   switch (queue) {
+

[PATCH v2 net-next 1/1] net: dsa: microchip: Add Microchip KSZ8795 DSA driver

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Add Microchip KSZ8795 DSA driver.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8795.c | 1365 +++
 drivers/net/dsa/microchip/ksz8795_reg.h | 1016 +++
 drivers/net/dsa/microchip/ksz8795_spi.c |  166 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 include/net/dsa.h   |1 +
 net/dsa/Kconfig |4 +
 net/dsa/dsa.c   |3 +
 net/dsa/dsa_priv.h  |1 +
 net/dsa/tag_ksz.c   |   32 +
 11 files changed, 2608 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index ab8f9f6..cb95d3d 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -10,3 +10,20 @@ config MICROCHIP_KSZ9477_SPI_DRIVER
depends on MICROCHIP_KSZ9477 && SPI
help
  Select to enable support for registering switches configured through 
SPI.
+
+menuconfig MICROCHIP_KSZ8795
+   tristate "Microchip KSZ8795 series switch support"
+   depends on NET_DSA
+   select NET_DSA_TAG_KSZ8795
+   help
+ This driver adds support for Microchip KSZ8795 switch chips.
+
+config MICROCHIP_KSZ8795_SPI_DRIVER
+   tristate "KSZ8795 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ8795 && SPI
+   default y
+   help
+ This driver accesses KSZ8795 chip through SPI.
+
+ It is required to use the KSZ8795 switch driver as the only access
+ is through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 13dd8f0..99a283e 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz9477.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ8795)+= ksz8795.o ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ8795_SPI_DRIVER) += ksz8795_spi.o
diff --git a/drivers/net/dsa/microchip/ksz8795.c 
b/drivers/net/dsa/microchip/ksz8795.c
new file mode 100644
index 000..bdec1ed
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -0,0 +1,1365 @@
+/*
+ * Microchip KSZ8795 switch driver
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ * Tristram Ha 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz8795_reg.h"
+
+static const struct {
+   char string[ETH_GSTRING_LEN];
+} ksz8795_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { "rx_hi" },
+   { "rx_undersize" },
+   { "rx_fragments" },
+   { "rx_oversize" },
+   { "rx_jabbers" },
+   { "rx_symbol_err" },
+   { "rx_crc_err" },
+   { "rx_align_err" },
+   { "rx_mac_ctrl" },
+   { "rx_pause" },
+   { "rx_bcast" },
+   { "rx_mcast" },
+   { "rx_ucast" },
+   { "rx_64_or_less" },
+   { "rx_65_127" },
+   { "rx_128_255" },
+   { "rx_256_511" },
+   { "rx_512_1023" },
+   { "rx_1024_1522" },
+   { "rx_1523_2000" },
+   { "rx_2001" },
+   { "tx_hi" },
+   { "tx_late_col" },
+   { "tx_pause" },
+   { "tx_bcast" },
+   { "tx_mcast" },
+   { "tx_ucast" },
+   { "tx_deferred" },
+   { "tx_total_col" },
+   { "tx_exc_col" },
+   { "tx_single_col" },
+   { "tx_mult_col" },
+   { "rx_total" },
+   { "tx_total" },
+   { "rx_discards" },
+   { "tx_discards" },
+};
+
+static int ksz8795_reset_switch(struct ksz_device *dev)
+{
+   /* reset switch */
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1,
+  SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+   ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+

[PATCH net-next 0/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

This patch requires the previous patch for the Microchip KSZ8795 DSA driver.

Tristram Ha (1):
  Add Microchip KSZ8895 DSA driver.

 drivers/net/dsa/microchip/Kconfig   |   17 +
 drivers/net/dsa/microchip/Makefile  |2 +
 drivers/net/dsa/microchip/ksz8895.c | 1276 +++
 drivers/net/dsa/microchip/ksz8895_reg.h |  824 
 drivers/net/dsa/microchip/ksz8895_spi.c |  157 
 drivers/net/dsa/microchip/ksz_priv.h|1 +
 6 files changed, 2277 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8895.c
 create mode 100644 drivers/net/dsa/microchip/ksz8895_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8895_spi.c

-- 
1.9.1

[PATCH v2 net-next] net: dsa: microchip: Add MIB counter reading support

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Add MIB counter reading support.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
v2
- Only MIB counter related code in patch

v1
- Simplify MIB counter reading code

 drivers/net/dsa/microchip/ksz9477.c| 121 ++---
 drivers/net/dsa/microchip/ksz_common.c | 100 +++
 drivers/net/dsa/microchip/ksz_common.h |   2 +
 drivers/net/dsa/microchip/ksz_priv.h   |   7 +-
 4 files changed, 185 insertions(+), 45 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 22a4b34..4e998a4 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -271,6 +271,76 @@ static int ksz9477_reset_switch(struct ksz_device *dev)
return 0;
 }
 
+static void ksz9477_r_mib_cnt(struct ksz_device *dev, int port, u16 addr,
+ u64 *cnt)
+{
+   u32 data;
+   int timeout;
+   struct ksz_port *p = >ports[port];
+
+   /* retain the flush/freeze bit */
+   data = p->freeze ? MIB_COUNTER_FLUSH_FREEZE : 0;
+   data |= MIB_COUNTER_READ;
+   data |= (addr << MIB_COUNTER_INDEX_S);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, data);
+
+   timeout = 1000;
+   do {
+   ksz_pread32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
+   );
+   usleep_range(1, 10);
+   if (!(data & MIB_COUNTER_READ))
+   break;
+   } while (timeout-- > 0);
+
+   /* failed to read MIB. get out of loop */
+   if (!timeout) {
+   dev_dbg(dev->dev, "Failed to get MIB\n");
+   return;
+   }
+
+   /* count resets upon read */
+   ksz_pread32(dev, port, REG_PORT_MIB_DATA, );
+   *cnt += data;
+}
+
+static void ksz9477_r_mib_pkt(struct ksz_device *dev, int port, u16 addr,
+ u64 *dropped, u64 *cnt)
+{
+   addr = ksz9477_mib_names[addr].index;
+   ksz9477_r_mib_cnt(dev, port, addr, cnt);
+}
+
+static void ksz9477_freeze_mib(struct ksz_device *dev, int port, bool freeze)
+{
+   struct ksz_port *p = >ports[port];
+   u32 val = freeze ? MIB_COUNTER_FLUSH_FREEZE : 0;
+
+   /* enable/disable the port for flush/freeze function */
+   mutex_lock(>mib.cnt_mutex);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, val);
+
+   /* used by MIB counter reading code to know freeze is enabled */
+   p->freeze = freeze;
+   mutex_unlock(>mib.cnt_mutex);
+}
+
+static void ksz9477_port_init_cnt(struct ksz_device *dev, int port)
+{
+   struct ksz_port_mib *mib = >ports[port].mib;
+
+   /* flush all enabled port MIB counters */
+   mutex_lock(>cnt_mutex);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
+MIB_COUNTER_FLUSH_FREEZE);
+   ksz_write8(dev, REG_SW_MAC_CTRL_6, SW_MIB_COUNTER_FLUSH);
+   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, 0);
+   mutex_unlock(>cnt_mutex);
+
+   mib->cnt_ptr = 0;
+   memset(mib->counters, 0, dev->mib_cnt * sizeof(u64));
+}
+
 static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
  int port)
 {
@@ -350,47 +420,6 @@ static void ksz9477_get_strings(struct dsa_switch *ds, int 
port, uint8_t *buf)
}
 }
 
-static void ksz_get_ethtool_stats(struct dsa_switch *ds, int port,
- uint64_t *buf)
-{
-   struct ksz_device *dev = ds->priv;
-   int i;
-   u32 data;
-   int timeout;
-
-   mutex_lock(>stats_mutex);
-
-   for (i = 0; i < TOTAL_SWITCH_COUNTER_NUM; i++) {
-   data = MIB_COUNTER_READ;
-   data |= ((ksz9477_mib_names[i].index & 0xFF) <<
-   MIB_COUNTER_INDEX_S);
-   ksz_pwrite32(dev, port, REG_PORT_MIB_CTRL_STAT__4, data);
-
-   timeout = 1000;
-   do {
-   ksz_pread32(dev, port, REG_PORT_MIB_CTRL_STAT__4,
-   );
-   usleep_range(1, 10);
-   if (!(data & MIB_COUNTER_READ))
-   break;
-   } while (timeout-- > 0);
-
-   /* failed to read MIB. get out of loop */
-   if (!timeout) {
-   dev_dbg(dev->dev, "Failed to get MIB\n");
-   break;
-   }
-
-   /* count resets upon read */
-   ksz_pread32(dev, port, REG_PORT_MIB_DATA, );
-
-   dev->mib_value[i] += (uint64_t)data;
-   buf[i] = dev->mib_value[i];
-   }
-
-   mutex_unlock(>stats_mutex);
-}
-
 static void ksz9477_cfg_port_member(struct ksz_device *dev, int port,
u8 member)
 {
@@ -1161,9 +1190,14 @@ static int ksz9477_setup(struct

[PATCH v1 net-next 7/7] net: dsa: microchip: Rename ksz_9477_reg.h to ksz9477_reg.h

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Rename ksz_9477_reg.h to ksz9477_reg.h for consistency as the product
name is always KSZ.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/ksz9477.c | 2 +-
 drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} | 0
 drivers/net/dsa/microchip/ksz_priv.h| 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} (100%)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 05fa859..22a4b34 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -31,7 +31,7 @@
 
 #include "ksz_priv.h"
 #include "ksz_common.h"
-#include "ksz_9477_reg.h"
+#include "ksz9477_reg.h"
 
 static const struct {
int index;
diff --git a/drivers/net/dsa/microchip/ksz_9477_reg.h 
b/drivers/net/dsa/microchip/ksz9477_reg.h
similarity index 100%
rename from drivers/net/dsa/microchip/ksz_9477_reg.h
rename to drivers/net/dsa/microchip/ksz9477_reg.h
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index d92a7c1..bfe9066 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -26,7 +26,7 @@
 #include 
 #include 
 
-#include "ksz_9477_reg.h"
+#include "ksz9477_reg.h"
 
 struct ksz_io_ops;
 
-- 
1.9.1

[PATCH v1 net-next 5/7] net: dsa: microchip: Break KSZ9477 DSA driver into two files

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Break KSZ9477 DSA driver into two files in preparation to add more KSZ
switch drivers.
Add common functions in ksz_common.h so that other KSZ switch drivers
can access code in ksz_common.c.
Add ksz_spi.h for common functions used by KSZ switch SPI drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/Makefile  |2 +-
 drivers/net/dsa/microchip/ksz9477.c | 1321 +++
 drivers/net/dsa/microchip/ksz9477_spi.c |  143 ++--
 drivers/net/dsa/microchip/ksz_common.c  | 1134 +++---
 drivers/net/dsa/microchip/ksz_common.h  |  230 ++
 drivers/net/dsa/microchip/ksz_priv.h|  229 +++---
 drivers/net/dsa/microchip/ksz_spi.h |   82 ++
 7 files changed, 1919 insertions(+), 1222 deletions(-)
 create mode 100644 drivers/net/dsa/microchip/ksz9477.c
 create mode 100644 drivers/net/dsa/microchip/ksz_common.h
 create mode 100644 drivers/net/dsa/microchip/ksz_spi.h

diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index 5b6325b..13dd8f0 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz9477.o ksz_common.o
 obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
new file mode 100644
index 000..080cb76
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -0,0 +1,1321 @@
+/*
+ * Microchip KSZ9477 switch driver main logic
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz_9477_reg.h"
+
+static const struct {
+   int index;
+   char string[ETH_GSTRING_LEN];
+} ksz9477_mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+   { 0x00, "rx_hi" },
+   { 0x01, "rx_undersize" },
+   { 0x02, "rx_fragments" },
+   { 0x03, "rx_oversize" },
+   { 0x04, "rx_jabbers" },
+   { 0x05, "rx_symbol_err" },
+   { 0x06, "rx_crc_err" },
+   { 0x07, "rx_align_err" },
+   { 0x08, "rx_mac_ctrl" },
+   { 0x09, "rx_pause" },
+   { 0x0A, "rx_bcast" },
+   { 0x0B, "rx_mcast" },
+   { 0x0C, "rx_ucast" },
+   { 0x0D, "rx_64_or_less" },
+   { 0x0E, "rx_65_127" },
+   { 0x0F, "rx_128_255" },
+   { 0x10, "rx_256_511" },
+   { 0x11, "rx_512_1023" },
+   { 0x12, "rx_1024_1522" },
+   { 0x13, "rx_1523_2000" },
+   { 0x14, "rx_2001" },
+   { 0x15, "tx_hi" },
+   { 0x16, "tx_late_col" },
+   { 0x17, "tx_pause" },
+   { 0x18, "tx_bcast" },
+   { 0x19, "tx_mcast" },
+   { 0x1A, "tx_ucast" },
+   { 0x1B, "tx_deferred" },
+   { 0x1C, "tx_total_col" },
+   { 0x1D, "tx_exc_col" },
+   { 0x1E, "tx_single_col" },
+   { 0x1F, "tx_mult_col" },
+   { 0x80, "rx_total" },
+   { 0x81, "tx_total" },
+   { 0x82, "rx_discards" },
+   { 0x83, "tx_discards" },
+};
+
+static void ksz9477_cfg32(struct ksz_device *dev, u32 addr, u32 bits, bool set)
+{
+   u32 data;
+
+   ksz_read32(dev, addr, );
+   if (set)
+   data |= bits;
+   else
+   data &= ~bits;
+   ksz_write32(dev, addr, data);
+}
+
+static void ksz9477_port_cfg32(struct ksz_device *dev, int port, int offset,
+  u32 bits, bool set)
+{
+   u32 addr;
+   u32 data;
+
+   addr = PORT_CTRL_ADDR(port, offset);
+   ksz_read32(dev, addr, );
+
+   if (set)
+   data |= bits;
+   else
+   data &= ~bits;
+
+   ksz_write32(dev, addr, data);
+}
+
+static int ksz9477_wait_vlan_ctrl_ready(struct ksz_device *dev, u32 waiton,
+   int timeout)
+{
+   u8 data;
+
+   do {
+   ksz_read8(dev, REG_SW_VLAN_CTRL, );
+   if (!(data & waiton))
+

[PATCH v1 net-next 6/7] net: dsa: microchip: Prepare PHY for proper advertisement

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Prepare PHY for proper advertisement and get link status for the port.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/ksz9477.c| 13 +
 drivers/net/dsa/microchip/ksz_common.c | 20 
 drivers/net/dsa/microchip/ksz_common.h |  2 ++
 drivers/net/dsa/microchip/ksz_priv.h   |  2 ++
 4 files changed, 37 insertions(+)

diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 080cb76..05fa859 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -980,6 +980,17 @@ static void ksz9477_port_mirror_del(struct dsa_switch *ds, 
int port,
 PORT_MIRROR_SNIFFER, false);
 }
 
+static void ksz9477_phy_setup(struct ksz_device *dev, int port,
+ struct phy_device *phy)
+{
+   if (port < dev->phy_port_cnt) {
+   /* SUPPORTED_Asym_Pause and SUPPORTED_Pause can be removed to
+* disable flow control when rate limiting is used.
+*/
+   phy->advertising = phy->supported;
+   }
+}
+
 static void ksz9477_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 {
u8 data8;
@@ -1161,6 +1172,7 @@ static int ksz9477_setup(struct dsa_switch *ds)
.setup  = ksz9477_setup,
.phy_read   = ksz9477_phy_read16,
.phy_write  = ksz9477_phy_write16,
+   .adjust_link= ksz_adjust_link,
.port_enable= ksz_enable_port,
.port_disable   = ksz_disable_port,
.get_strings= ksz9477_get_strings,
@@ -1303,6 +1315,7 @@ static void ksz9477_switch_exit(struct ksz_device *dev)
.get_port_addr = ksz9477_get_port_addr,
.cfg_port_member = ksz9477_cfg_port_member,
.flush_dyn_mac_table = ksz9477_flush_dyn_mac_table,
+   .phy_setup = ksz9477_phy_setup,
.port_setup = ksz9477_port_setup,
.shutdown = ksz9477_reset_switch,
.detect = ksz9477_switch_detect,
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 1c9c4c5..e50ea56 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -69,6 +69,25 @@ int ksz_phy_write16(struct dsa_switch *ds, int addr, int 
reg, u16 val)
return 0;
 }
 
+void ksz_adjust_link(struct dsa_switch *ds, int port,
+struct phy_device *phydev)
+{
+   struct ksz_device *dev = ds->priv;
+   struct ksz_port *p = >ports[port];
+
+   if (phydev->link) {
+   p->speed = phydev->speed;
+   p->duplex = phydev->duplex;
+   p->flow_ctrl = phydev->pause;
+   p->link_up = 1;
+   dev->live_ports |= (1 << port) & dev->on_ports;
+   } else if (p->link_up) {
+   p->link_up = 0;
+   p->link_down = 1;
+   dev->live_ports &= ~(1 << port);
+   }
+}
+
 int ksz_sset_count(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
@@ -249,6 +268,7 @@ int ksz_enable_port(struct dsa_switch *ds, int port, struct 
phy_device *phy)
 
/* setup slave port */
dev->dev_ops->port_setup(dev, port, false);
+   dev->dev_ops->phy_setup(dev, port, phy);
 
/* port_stp_state_set() will be called after to enable the port so
 * there is no need to do anything.
diff --git a/drivers/net/dsa/microchip/ksz_common.h 
b/drivers/net/dsa/microchip/ksz_common.h
index 1c1cbad..9d71387 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -26,6 +26,8 @@
 
 int ksz_phy_read16(struct dsa_switch *ds, int addr, int reg);
 int ksz_phy_write16(struct dsa_switch *ds, int addr, int reg, u16 val);
+void ksz_adjust_link(struct dsa_switch *ds, int port,
+struct phy_device *phydev);
 int ksz_sset_count(struct dsa_switch *ds);
 int ksz_port_bridge_join(struct dsa_switch *ds, int port,
 struct net_device *br);
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index 4126749..d92a7c1 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -150,6 +150,8 @@ struct ksz_dev_ops {
u32 (*get_port_addr)(int port, int offset);
void (*cfg_port_member)(struct ksz_device *dev, int port, u8 member);
void (*flush_dyn_mac_table)(struct ksz_device *dev, int port);
+   void (*phy_setup)(struct ksz_device *dev, int port,
+ struct phy_device *phy);
void (*port_setup)(struct ksz_device *dev, int port, bool cpu_port);
void (*r_phy)(struct ksz_device *dev, u16 phy, u16 reg, u16 *val);
void (*w_phy)(struct ksz_device *dev, u16 phy, u16 reg, u16 val);
-- 
1.9.1

[PATCH net-next] dt-bindings: net: dsa: Document additional Microchip KSZ family switches

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Document additional Microchip KSZ family switches.

Signed-off-by: Tristram Ha 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
---
 Documentation/devicetree/bindings/net/dsa/ksz.txt | 189 --
 1 file changed, 136 insertions(+), 53 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt 
b/Documentation/devicetree/bindings/net/dsa/ksz.txt
index fd23904..705f9d9 100644
--- a/Documentation/devicetree/bindings/net/dsa/ksz.txt
+++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt
@@ -3,8 +3,15 @@ Microchip KSZ Series Ethernet switches
 
 Required properties:
 
-- compatible: For external switch chips, compatible string must be exactly one
-  of: "microchip,ksz9477"
+- compatible: "microchip,ksz9477",
+ "microchip,ksz8795",
+ "microchip,ksz8794",
+ "microchip,ksz8765",
+ "microchip,ksz8895",
+ "microchip,ksz8864",
+ "microchip,ksz8873",
+ "microchip,ksz8863",
+ "microchip,ksz8463"
 
 See Documentation/devicetree/bindings/dsa/dsa.txt for a list of additional
 required and optional properties.
@@ -13,58 +20,134 @@ Examples:
 
 Ethernet switch connected via SPI to the host, CPU port wired to eth0:
 
- eth0: ethernet@10001000 {
- fixed-link {
- speed = <1000>;
- full-duplex;
- };
- };
+   eth0: ethernet@10001000 {
+   fixed-link {
+   speed = <1000>;
+   full-duplex;
+   };
+   };
 
- spi1: spi@f8008000 {
- pinctrl-0 = <_spi_ksz>;
- cs-gpios = < 25 0>;
- id = <1>;
+   spi1: spi@f8008000 {
+   cs-gpios = < 25 0>;
+   id = <1>;
 
- ksz9477: ksz9477@0 {
- compatible = 
"microchip,ksz9477";
- reg = <0>;
+   ksz9477: ksz9477@0 {
+   compatible = "microchip,ksz9477";
+   reg = <0>;
 
- spi-max-frequency 
= <4400>;
- spi-cpha;
- spi-cpol;
+   spi-max-frequency = <4400>;
+   spi-cpha;
+   spi-cpol;
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   port@0 {
+   reg = <0>;
+   label = "lan1";
+   };
+   port@1 {
+   reg = <1>;
+   label = "lan2";
+   };
+   port@2 {
+   reg = <2>;
+   label = "lan3";
+   };
+   port@3 {
+   reg = <3>;
+   label = "lan4";
+   };
+   port@4 {
+   reg = <4>;
+   label = "lan5";
+   };
+   port@5 {
+   reg = <5>;
+   label = "cpu";
+   ethernet = <>;
+   fixed-link {
+   speed = <1000>;
+   full-duplex;
+   };
+   };
+   port@6 {
+   reg = <6>;
+   label = "lan6";
+   fixed-link {
+   speed = <1000>;
+   full-duplex;
+   };
+   };
+   };
+   };
+   ksz8794: ksz8794@0 {
+   compatible =

[PATCH v1 net-next 4/7] net: dsa: microchip: Rename ksz_spi.c to ksz9477_spi.c

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Rename ksz_spi.c to ksz9477_spi.c and update Kconfig in preparation to add
more KSZ switch drivers.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/Kconfig  | 12 ++--
 drivers/net/dsa/microchip/Makefile |  4 ++--
 drivers/net/dsa/microchip/{ksz_spi.c => ksz9477_spi.c} |  0
 3 files changed, 8 insertions(+), 8 deletions(-)
 rename drivers/net/dsa/microchip/{ksz_spi.c => ksz9477_spi.c} (100%)

diff --git a/drivers/net/dsa/microchip/Kconfig 
b/drivers/net/dsa/microchip/Kconfig
index a8b8f59..5a8660d 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -1,12 +1,12 @@
-menuconfig MICROCHIP_KSZ
-   tristate "Microchip KSZ series switch support"
+menuconfig MICROCHIP_KSZ9477
+   tristate "Microchip KSZ9477 series switch support"
depends on NET_DSA
select NET_DSA_TAG_KSZ
help
- This driver adds support for Microchip KSZ switch chips.
+ This driver adds support for Microchip KSZ9477 switch chips.
 
-config MICROCHIP_KSZ_SPI_DRIVER
-   tristate "KSZ series SPI connected switch driver"
-   depends on MICROCHIP_KSZ && SPI
+config MICROCHIP_KSZ9477_SPI_DRIVER
+   tristate "KSZ9477 series SPI connected switch driver"
+   depends on MICROCHIP_KSZ9477 && SPI
help
  Select to enable support for registering switches configured through 
SPI.
diff --git a/drivers/net/dsa/microchip/Makefile 
b/drivers/net/dsa/microchip/Makefile
index ed335e2..5b6325b 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_MICROCHIP_KSZ)+= ksz_common.o
-obj-$(CONFIG_MICROCHIP_KSZ_SPI_DRIVER) += ksz_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ9477)+= ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ9477_SPI_DRIVER) += ksz9477_spi.o
diff --git a/drivers/net/dsa/microchip/ksz_spi.c 
b/drivers/net/dsa/microchip/ksz9477_spi.c
similarity index 100%
rename from drivers/net/dsa/microchip/ksz_spi.c
rename to drivers/net/dsa/microchip/ksz9477_spi.c
-- 
1.9.1

[PATCH v1 net-next 2/7] net: dsa: microchip: Clean up code according to patch check suggestions

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Clean up code according to patch check suggestions.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz_common.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index ed67ef6..1388f1a 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -899,9 +899,9 @@ static void ksz_port_mdb_add(struct dsa_switch *ds, int 
port,
 
if (static_table[0] & ALU_V_STATIC_VALID) {
/* check this has same vid & mac address */
-   if (((static_table[2] >> ALU_V_FID_S) == (mdb->vid)) &&
+   if (((static_table[2] >> ALU_V_FID_S) == mdb->vid) &&
((static_table[2] & ALU_V_MAC_ADDR_HI) == mac_hi) &&
-   (static_table[3] == mac_lo)) {
+   static_table[3] == mac_lo) {
/* found matching one */
break;
}
@@ -972,9 +972,9 @@ static int ksz_port_mdb_del(struct dsa_switch *ds, int port,
if (static_table[0] & ALU_V_STATIC_VALID) {
/* check this has same vid & mac address */
 
-   if (((static_table[2] >> ALU_V_FID_S) == (mdb->vid)) &&
+   if (((static_table[2] >> ALU_V_FID_S) == mdb->vid) &&
((static_table[2] & ALU_V_MAC_ADDR_HI) == mac_hi) &&
-   (static_table[3] == mac_lo)) {
+   static_table[3] == mac_lo) {
/* found matching one */
break;
}
-- 
1.9.1

[PATCH v1 net-next 0/7] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

This series of patches is to modify the original KSZ9477 DSA driver so
that other KSZ switch drivers can be added and use the common code.

There are several steps to accomplish this.  First is to
rename some function names with a prefix to indicate chip specific
function.  Second is to move common code into header that can be shared.
Last is to modify tag_ksz.c so that it can handle many tail tag formats
used by different KSZ switch drivers.

ksz_common.c will contain the common code used by all KSZ switch drivers.
ksz9477.c will contain KSZ9477 code from the original ksz_common.c.
ksz9477_spi.c is renamed from ksz_spi.c.
ksz9477_reg.h is renamed from ksz_9477_reg.h.
ksz_common.h is added to provide common code access to KSZ switch
drivers.
ksz_spi.h is added to provide common SPI access functions to KSZ SPI
drivers.

v1
- Each patch in the set is self-contained
- Use ksz9477 prefix to indicate KSZ9477 specific code

Tristram Ha (7):
  Replace license with GPL.
  Clean up code according to patch check suggestions.
  Rename some functions with ksz9477 prefix to separate chip specific
code from common code.
  Rename ksz_spi.c to ksz9477_spi.c and update Kconfig in preparation to
add more KSZ switch drivers.
  Break KSZ9477 DSA driver into two files in preparation to add more KSZ
switch drivers.   Add common functions in ksz_common.h so that other
KSZ switch drivers can access code in ksz_common.c.   Add ksz_spi.h
for common functions used by KSZ switch SPI drivers.
  Prepare PHY for proper advertisement and get link status for the port.
  Rename ksz_9477_reg.h to ksz9477_reg.h for consistency as the product
name is always KSZ.

 drivers/net/dsa/microchip/Kconfig  |   12 +-
 drivers/net/dsa/microchip/Makefile |4 +-
 drivers/net/dsa/microchip/ksz9477.c| 1334 
 .../microchip/{ksz_9477_reg.h => ksz9477_reg.h}|   23 +-
 drivers/net/dsa/microchip/ksz9477_spi.c|  188 +++
 drivers/net/dsa/microchip/ksz_common.c | 1167 +++--
 drivers/net/dsa/microchip/ksz_common.h |  232 
 drivers/net/dsa/microchip/ksz_priv.h   |  256 ++--
 drivers/net/dsa/microchip/ksz_spi.c|  216 
 drivers/net/dsa/microchip/ksz_spi.h|   82 ++
 10 files changed, 2127 insertions(+), 1387 deletions(-)
 create mode 100644 drivers/net/dsa/microchip/ksz9477.c
 rename drivers/net/dsa/microchip/{ksz_9477_reg.h => ksz9477_reg.h} (98%)
 create mode 100644 drivers/net/dsa/microchip/ksz9477_spi.c
 create mode 100644 drivers/net/dsa/microchip/ksz_common.h
 delete mode 100644 drivers/net/dsa/microchip/ksz_spi.c
 create mode 100644 drivers/net/dsa/microchip/ksz_spi.h

-- 
1.9.1

[PATCH v1 net-next 3/7] net: dsa: microchip: Rename some functions with ksz9477 prefix to separate chip specific code from common code.

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Rename some functions with ksz9477 prefix to separate chip specific code
from common code.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
Reviewed-by: Pavel Machek 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/microchip/ksz_common.c | 118 +
 1 file changed, 60 insertions(+), 58 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index 1388f1a..06227ef 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -265,9 +265,8 @@ static int wait_alu_sta_ready(struct ksz_device *dev, u32 
waiton, int timeout)
return 0;
 }
 
-static int ksz_reset_switch(struct dsa_switch *ds)
+static int ksz9477_reset_switch(struct ksz_device *dev)
 {
-   struct ksz_device *dev = ds->priv;
u8 data8;
u16 data16;
u32 data32;
@@ -300,7 +299,7 @@ static int ksz_reset_switch(struct dsa_switch *ds)
return 0;
 }
 
-static void port_setup(struct ksz_device *dev, int port, bool cpu_port)
+static void ksz9477_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 {
u8 data8;
u16 data16;
@@ -346,7 +345,7 @@ static void port_setup(struct ksz_device *dev, int port, 
bool cpu_port)
ksz_pread16(dev, port, REG_PORT_PHY_INT_ENABLE, );
 }
 
-static void ksz_config_cpu_port(struct dsa_switch *ds)
+static void ksz9477_config_cpu_port(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
int i;
@@ -358,12 +357,12 @@ static void ksz_config_cpu_port(struct dsa_switch *ds)
dev->cpu_port = i;
 
/* enable cpu port */
-   port_setup(dev, i, true);
+   ksz9477_port_setup(dev, i, true);
}
}
 }
 
-static int ksz_setup(struct dsa_switch *ds)
+static int ksz9477_setup(struct dsa_switch *ds)
 {
struct ksz_device *dev = ds->priv;
int ret = 0;
@@ -373,7 +372,7 @@ static int ksz_setup(struct dsa_switch *ds)
if (!dev->vlan_cache)
return -ENOMEM;
 
-   ret = ksz_reset_switch(ds);
+   ret = ksz9477_reset_switch(dev);
if (ret) {
dev_err(ds->dev, "failed to reset switch\n");
return ret;
@@ -382,7 +381,7 @@ static int ksz_setup(struct dsa_switch *ds)
/* accept packet up to 2000bytes */
ksz_cfg(dev, REG_SW_MAC_CTRL_1, SW_LEGAL_PACKET_DISABLE, true);
 
-   ksz_config_cpu_port(ds);
+   ksz9477_config_cpu_port(ds);
 
ksz_cfg(dev, REG_SW_MAC_CTRL_1, MULTICAST_STORM_DISABLE, true);
 
@@ -395,13 +394,13 @@ static int ksz_setup(struct dsa_switch *ds)
return 0;
 }
 
-static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
- int port)
+static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds,
+ int port)
 {
return DSA_TAG_PROTO_KSZ;
 }
 
-static int ksz_phy_read16(struct dsa_switch *ds, int addr, int reg)
+static int ksz9477_phy_read16(struct dsa_switch *ds, int addr, int reg)
 {
struct ksz_device *dev = ds->priv;
u16 val = 0;
@@ -411,7 +410,8 @@ static int ksz_phy_read16(struct dsa_switch *ds, int addr, 
int reg)
return val;
 }
 
-static int ksz_phy_write16(struct dsa_switch *ds, int addr, int reg, u16 val)
+static int ksz9477_phy_write16(struct dsa_switch *ds, int addr, int reg,
+  u16 val)
 {
struct ksz_device *dev = ds->priv;
 
@@ -426,7 +426,7 @@ static int ksz_enable_port(struct dsa_switch *ds, int port,
struct ksz_device *dev = ds->priv;
 
/* setup slave port */
-   port_setup(dev, port, false);
+   ksz9477_port_setup(dev, port, false);
 
return 0;
 }
@@ -445,7 +445,7 @@ static int ksz_sset_count(struct dsa_switch *ds)
return TOTAL_SWITCH_COUNTER_NUM;
 }
 
-static void ksz_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
+static void ksz9477_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
 {
int i;
 
@@ -495,7 +495,8 @@ static void ksz_get_ethtool_stats(struct dsa_switch *ds, 
int port,
mutex_unlock(>stats_mutex);
 }
 
-static void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
+static void ksz9477_port_stp_state_set(struct dsa_switch *ds, int port,
+  u8 state)
 {
struct ksz_device *dev = ds->priv;
u8 data;
@@ -540,7 +541,8 @@ static void ksz_port_fast_age(struct dsa_switch *ds, int 
port)
ksz_write8(dev, REG_SW_LUE_CTRL_1, data8);
 }
 
-static int ksz_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag)
+static int ksz9477_port_vlan_filtering(struct

[PATCH v1 net-next 1/7] net: dsa: microchip: Replace license with GPL

2017-11-16 Thread Tristram.Ha

From: Tristram Ha 

Replace license with GPL.

Signed-off-by: Tristram Ha 
Reviewed-by: Woojung Huh 
---
 drivers/net/dsa/microchip/ksz_9477_reg.h | 23 ---
 drivers/net/dsa/microchip/ksz_common.c   | 23 ---
 drivers/net/dsa/microchip/ksz_priv.h | 23 ---
 drivers/net/dsa/microchip/ksz_spi.c  | 23 ---
 4 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/dsa/microchip/ksz_9477_reg.h 
b/drivers/net/dsa/microchip/ksz_9477_reg.h
index 6aa6752..26a0e4b 100644
--- a/drivers/net/dsa/microchip/ksz_9477_reg.h
+++ b/drivers/net/dsa/microchip/ksz_9477_reg.h
@@ -1,19 +1,20 @@
 /*
  * Microchip KSZ9477 register definitions
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef __KSZ9477_REGS_H
diff --git a/drivers/net/dsa/microchip/ksz_common.c 
b/drivers/net/dsa/microchip/ksz_common.c
index b5be93a..ed67ef6 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1,19 +1,20 @@
 /*
  * Microchip switch driver main logic
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include 
diff --git a/drivers/net/dsa/microchip/ksz_priv.h 
b/drivers/net/dsa/microchip/ksz_priv.h
index 2a98dbd..d461468 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -1,19 +1,20 @@
 /*
  * Microchip KSZ series switch common definitions
  *
- * Copyright (C) 2017
+ * Copyright (C) 2017 Microchip Technology Inc.
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL

Re: [PATCH 7.x ubsan fix 3/6] net: sctp, forbid negative length

2017-11-16 Thread Kefeng Wang

sorry, please ignore...


On 2017/11/17 10:33, Kefeng Wang wrote:
> From: Jiri Slaby 
>
> mainline inclusion
> from mainline-4.9
> commit a4b8e71b05c27bae6bad3bdecddbc6b68a3ad8cf
> category: bugfix
> bugzilla: 3214
> DTS: NA
> CVE: NA
>
> -
>
> Most of getsockopt handlers in net/sctp/socket.c check len against
> sizeof some structure like:
> if (len < sizeof(int))
> return -EINVAL;
>
> On the first look, the check seems to be correct. But since len is int
> and sizeof returns size_t, int gets promoted to unsigned size_t too. So
> the test returns false for negative lengths. Yes, (-1 < sizeof(long)) is
> false.
>
> Fix this in sctp by explicitly checking len < 0 before any getsockopt
> handler is called.
>
> Note that sctp_getsockopt_events already handled the negative case.
> Since we added the < 0 check elsewhere, this one can be removed.
>
> If not checked, this is the result:
> UBSAN: Undefined behaviour in ../mm/page_alloc.c:2722:19
> shift exponent 52 is too large for 32-bit type 'int'
> CPU: 1 PID: 24535 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
>   88006d99f2a8 b2f7bdea 41b58ab3
>  b4363c14 b2f7bcde 88006d99f2d0 88006d99f270
>    0034 b5096422
> Call Trace:
>  [] ? __ubsan_handle_shift_out_of_bounds+0x29c/0x300
> ...
>  [] ? kmalloc_order+0x24/0x90
>  [] ? kmalloc_order_trace+0x24/0x220
>  [] ? __kmalloc+0x330/0x540
>  [] ? sctp_getsockopt_local_addrs+0x174/0xca0 [sctp]
>  [] ? sctp_getsockopt+0x10d/0x1b0 [sctp]
>  [] ? sock_common_getsockopt+0xb9/0x150
>  [] ? SyS_getsockopt+0x1a5/0x270
>
> Signed-off-by: Jiri Slaby 
> Cc: Vlad Yasevich 
> Cc: Neil Horman 
> Cc: "David S. Miller" 
> Cc: linux-s...@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Acked-by: Neil Horman 
> Signed-off-by: David S. Miller 
> (cherry picked from commit a4b8e71b05c27bae6bad3bdecddbc6b68a3ad8cf)
> Signed-off-by: Kefeng Wang 
> ---
>  net/sctp/socket.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 425a1a3..3a42f98 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4589,7 +4589,7 @@ static int sctp_getsockopt_disable_fragments(struct 
> sock *sk, int len,
>  static int sctp_getsockopt_events(struct sock *sk, int len, char __user 
> *optval,
> int __user *optlen)
>  {
> - if (len <= 0)
> + if (len == 0)
>   return -EINVAL;
>   if (len > sizeof(struct sctp_event_subscribe))
>   len = sizeof(struct sctp_event_subscribe);
> @@ -6252,6 +6252,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
> int optname,
>   if (get_user(len, optlen))
>   return -EFAULT;
>  
> + if (len < 0)
> + return -EINVAL;
> +
>   lock_sock(sk);
>  
>   switch (optname) {

[PATCH 7.x ubsan fix 3/6] net: sctp, forbid negative length

2017-11-16 Thread Kefeng Wang

From: Jiri Slaby 

mainline inclusion
from mainline-4.9
commit a4b8e71b05c27bae6bad3bdecddbc6b68a3ad8cf
category: bugfix
bugzilla: 3214
DTS: NA
CVE: NA

-

Most of getsockopt handlers in net/sctp/socket.c check len against
sizeof some structure like:
if (len < sizeof(int))
return -EINVAL;

On the first look, the check seems to be correct. But since len is int
and sizeof returns size_t, int gets promoted to unsigned size_t too. So
the test returns false for negative lengths. Yes, (-1 < sizeof(long)) is
false.

Fix this in sctp by explicitly checking len < 0 before any getsockopt
handler is called.

Note that sctp_getsockopt_events already handled the negative case.
Since we added the < 0 check elsewhere, this one can be removed.

If not checked, this is the result:
UBSAN: Undefined behaviour in ../mm/page_alloc.c:2722:19
shift exponent 52 is too large for 32-bit type 'int'
CPU: 1 PID: 24535 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
  88006d99f2a8 b2f7bdea 41b58ab3
 b4363c14 b2f7bcde 88006d99f2d0 88006d99f270
   0034 b5096422
Call Trace:
 [] ? __ubsan_handle_shift_out_of_bounds+0x29c/0x300
...
 [] ? kmalloc_order+0x24/0x90
 [] ? kmalloc_order_trace+0x24/0x220
 [] ? __kmalloc+0x330/0x540
 [] ? sctp_getsockopt_local_addrs+0x174/0xca0 [sctp]
 [] ? sctp_getsockopt+0x10d/0x1b0 [sctp]
 [] ? sock_common_getsockopt+0xb9/0x150
 [] ? SyS_getsockopt+0x1a5/0x270

Signed-off-by: Jiri Slaby 
Cc: Vlad Yasevich 
Cc: Neil Horman 
Cc: "David S. Miller" 
Cc: linux-s...@vger.kernel.org
Cc: netdev@vger.kernel.org
Acked-by: Neil Horman 
Signed-off-by: David S. Miller 
(cherry picked from commit a4b8e71b05c27bae6bad3bdecddbc6b68a3ad8cf)
Signed-off-by: Kefeng Wang 
---
 net/sctp/socket.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 425a1a3..3a42f98 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4589,7 +4589,7 @@ static int sctp_getsockopt_disable_fragments(struct sock 
*sk, int len,
 static int sctp_getsockopt_events(struct sock *sk, int len, char __user 
*optval,
  int __user *optlen)
 {
-   if (len <= 0)
+   if (len == 0)
return -EINVAL;
if (len > sizeof(struct sctp_event_subscribe))
len = sizeof(struct sctp_event_subscribe);
@@ -6252,6 +6252,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
int optname,
if (get_user(len, optlen))
return -EFAULT;
 
+   if (len < 0)
+   return -EINVAL;
+
lock_sock(sk);
 
switch (optname) {
-- 
1.8.3.1

[lkp-robot] [socket] 3bc58b1a49: BUG:using__this_cpu_add()in_preemptible[#]code:krfcommd

2017-11-16 Thread kernel test robot


FYI, we noticed the following commit (built with gcc-6):

commit: 3bc58b1a49b4179db035ca1da289abc0f25a4df2 ("socket: Move the socket 
inuse to namespace.")
url: 
https://github.com/0day-ci/linux/commits/Tonghao-Zhang/socket-Move-the-socket-inuse-to-namespace/20171116-175157


in testcase: trinity
with following parameters:

runtime: 300s

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


on test machine: qemu-system-x86_64 -enable-kvm -m 420M

caused below changes (please refer to attached dmesg/kmsg for entire 
log/backtrace):


+-+++
| | 2798b80b38 | 
3bc58b1a49 |
+-+++
| boot_successes  | 12 | 0  
|
| boot_failures   | 0  | 14 
|
| BUG:using__this_cpu_add()in_preemptible[#]code:swapper  | 0  | 6  
|
| BUG:using__this_cpu_add()in_preemptible[#]code:ubus | 0  | 5  
|
| BUG:using__this_cpu_add()in_preemptible[#]code:ubusd| 0  | 5  
|
| BUG:using__this_cpu_add()in_preemptible[#]code:krfcommd | 0  | 8  
|
+-+++



[6.960477] BUG: using __this_cpu_add() in preemptible [] code: 
krfcommd/178
[6.960483] caller is __this_cpu_preempt_check+0x13/0x20
[6.960485] CPU: 0 PID: 178 Comm: krfcommd Not tainted 
4.14.0-rc7-02177-g3bc58b1 #1
[6.960487] Call Trace:
[6.960493]  dump_stack+0x65/0x92
[6.960495]  check_preemption_disabled+0xf4/0x110
[6.960497]  __this_cpu_preempt_check+0x13/0x20
[6.960501]  sock_alloc+0x86/0xa0
[6.960503]  __sock_create+0x99/0x1b0
[6.960506]  ? _raw_spin_unlock_irqrestore+0x7f/0x90
[6.960508]  sock_create_kern+0x14/0x20
[6.960511]  rfcomm_l2sock_create+0x28/0x50
[6.960514]  rfcomm_run+0x58/0x12a0
[6.960517]  ? wait_woken+0x90/0x90
[6.960520]  kthread+0x134/0x140
[6.960522]  ? rfcomm_check_accept+0xa0/0xa0
[6.960524]  ? __kthread_bind_mask+0x90/0x90
[6.960526]  ret_from_fork+0x25/0x30
[6.975298] Bluetooth: HIDP socket layer initialized
[6.986709] RPC: Registered rdma transport module.
[6.987631] RPC: Registered rdma backchannel transport module.
[6.988865] NET: Registered protocol family 33
[6.989705] Key type rxrpc registered
[6.990451] Key type rxrpc_s registered
[6.996746] NET: Registered protocol family 41
[7.003443] l2tp_core: L2TP core driver, V2.0
[7.004276] l2tp_ip: L2TP IP encapsulation support (L2TPv3)
[7.005348] l2tp_netlink: L2TP netlink interface
[7.006262] l2tp_eth: L2TP ethernet pseudowire support (L2TPv3)
[7.007458] l2tp_debugfs: L2TP debugfs support
[7.008297] l2tp_ip6: L2TP IP encapsulation support for IPv6 (L2TPv3)
[7.009521] 8021q: 802.1Q VLAN Support v1.8
[7.010453] sctp: Hash tables configured (bind 64/64)
[7.011602] NET: Registered protocol family 43
[7.012658] Key type dns_resolver registered
[7.013544] Key type ceph registered
[7.020091] libceph: loaded (mon/osd proto 15/24)
[7.026824] batman_adv: B.A.T.M.A.N. advanced 2017.4 (compatibility version 
15) loaded
[7.028338] openvswitch: Open vSwitch switching datapath
[7.029592] mpls_gso: MPLS GSO support
[7.030507] start plist test
[7.040725] end plist test
[7.041253] mce: Unable to init MCE device (rc: -5)
[7.042908] RAS: Correctable Errors collector initialized.
[7.044011] ... APIC ID:   (0)
[7.044794] ... APIC VERSION: 01050014
[7.045526] 
[7.046882] 
[7.047273] 8000
[7.049659] number of MP IRQ sources: 15.
[7.050460] number of IO-APIC #0 registers: 24.
[7.051318] testing the IO APIC...
[7.052305] IO APIC #0..
[7.052859]  register #00: 
[7.053597] ...: physical APIC id: 00
[7.054417] ...: Delivery Type: 0
[7.055174] ...: LTS  : 0
[7.055934]  register #01: 00170011
[7.056651] ... : max redirection entries: 17
[7.057628] ... : PRQ implemented: 0
[7.058435] ... : IO APIC version: 11
[7.059253]  register #02: 
[7.059975] ... : arbitration: 00
[7.060754]  IRQ redirection table:
[7.061480] IOAPIC 0:
[7.061937]  pin00, disabled, edge , high, V(00), IRR(0), S(0), physical, 
D(00), M(0)
[7.063418]  pin01, enabled , edge , high, V(31), IRR(0), S(0), logical , 
D(01), M(1)
[7.0648

[PATCH net 0/5] nfp: flower fixes and typo in ethtool stats name

2017-11-16 Thread Jakub Kicinski

Hi!

This set comes from the flower offload team.  From Pieter we have a fix
to the semantics of the flag telling FW whether to allocate or free
a mask and correction of a typo in the name of one of the MAC statistics 
(reveive -> received, we use past participle to match HW docs).

Dirk fixes propagation of max MTU to representors.

John improves VXLAN offload.  The old code was not using egress_dev at
all, so Jiri missed it in his conversion.  The validation of ingress
port is still not perfect, we will have to wait for shared block dust
to settle to tackle it.  This is how John explains the cases:


The following example rule is now correctly offloaded in net-next kernel:

tc filter add dev vxlan0 ... enc_dst_port 4789 ... skip_sw \
  action redirect dev nfp_p0

The following rule will not be offloaded to the NFP (previously it
incorrectly matched vxlan packets - it shouldn't as ingress dev is not
a vxlan netdev):

tc filter add dev nfp_p0 ... enc_dst_port 4789 ... skip_sw \
  action redirect dev nfp_p0

Rules that are not matching on tunnels and are an egress offload are
rejected. The standard match code assumes the offloaded repr is the
ingress port. Rejecting egress offloads removes the chances of false
interpretation of the rules on the NFP.

A known issue is that the following rule example could still be offloaded
and incorrectly match tunnel data:

tc filter add dev dummy ... enc_dst_port 4789 ... skip_sw \
  action redirect dev nfp_p0

Because the egress register callback does not give information on the
ingress netdev, the patch assumes that if it is not a repr then it is 
the correct tunnel netdev. This may not be the case. The chance of this
happening is reduced, as the rule is required to match on the well-known
vxlan port, but it is still possible.


Dirk van der Merwe (1):
  nfp: inherit the max_mtu from the PF netdev

John Hurley (2):
  nfp: register flower reprs for egress dev offload
  nfp: remove false positive offloads in flower vxlan

Pieter Jansen van Vuuren (2):
  nfp: fix flower offload metadata flag usage
  nfp: fix vlan receive MAC statistics typo

 drivers/net/ethernet/netronome/nfp/flower/main.c   | 18 +++
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  5 +--
 .../net/ethernet/netronome/nfp/flower/metadata.c   |  7 +++--
 .../net/ethernet/netronome/nfp/flower/offload.c| 36 ++
 drivers/net/ethernet/netronome/nfp/nfp_app.h   | 20 
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c  | 11 ++-
 drivers/net/ethernet/netronome/nfp/nfp_port.h  |  2 +-
 8 files changed, 88 insertions(+), 13 deletions(-)

-- 
2.14.1

[PATCH net 4/5] nfp: register flower reprs for egress dev offload

2017-11-16 Thread Jakub Kicinski

From: John Hurley 

Register a callback for offloading flows that have a repr as their egress
device. The new egdev_register function is added to net-next for the 4.15
release.

Signed-off-by: John Hurley 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/main.c| 18 ++
 drivers/net/ethernet/netronome/nfp/flower/main.h|  2 ++
 drivers/net/ethernet/netronome/nfp/flower/offload.c |  6 ++
 drivers/net/ethernet/netronome/nfp/nfp_app.h| 20 
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c   |  9 -
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
b/drivers/net/ethernet/netronome/nfp/flower/main.c
index e0283bb24f06..8fcc90c0d2d3 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -125,6 +125,21 @@ nfp_flower_repr_netdev_stop(struct nfp_app *app, struct 
nfp_repr *repr)
return nfp_flower_cmsg_portmod(repr, false);
 }
 
+static int
+nfp_flower_repr_netdev_init(struct nfp_app *app, struct net_device *netdev)
+{
+   return tc_setup_cb_egdev_register(netdev,
+ nfp_flower_setup_tc_egress_cb,
+ netdev_priv(netdev));
+}
+
+static void
+nfp_flower_repr_netdev_clean(struct nfp_app *app, struct net_device *netdev)
+{
+   tc_setup_cb_egdev_unregister(netdev, nfp_flower_setup_tc_egress_cb,
+netdev_priv(netdev));
+}
+
 static void nfp_flower_sriov_disable(struct nfp_app *app)
 {
struct nfp_flower_priv *priv = app->priv;
@@ -452,6 +467,9 @@ const struct nfp_app_type app_flower = {
.vnic_init  = nfp_flower_vnic_init,
.vnic_clean = nfp_flower_vnic_clean,
 
+   .repr_init  = nfp_flower_repr_netdev_init,
+   .repr_clean = nfp_flower_repr_netdev_clean,
+
.repr_open  = nfp_flower_repr_netdev_open,
.repr_stop  = nfp_flower_repr_netdev_stop,
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h 
b/drivers/net/ethernet/netronome/nfp/flower/main.h
index a69ea62e9c9c..e6b26c5ae6e0 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -196,5 +196,7 @@ void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 
ipv4);
 void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4);
 void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb);
 void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb);
+int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv);
 
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index cdbb5464b790..a0193e0c24a0 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -465,6 +465,12 @@ nfp_flower_repr_offload(struct nfp_app *app, struct 
net_device *netdev,
return -EOPNOTSUPP;
 }
 
+int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv)
+{
+   return -EINVAL;
+}
+
 static int nfp_flower_setup_tc_block_cb(enum tc_setup_type type,
void *type_data, void *cb_priv)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h 
b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 54b67c9b8d5b..0e5e0305ad1c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -76,6 +76,8 @@ extern const struct nfp_app_type app_flower;
  * @vnic_free: free up app's vNIC state
  * @vnic_init: vNIC netdev was registered
  * @vnic_clean:vNIC netdev about to be unregistered
+ * @repr_init: representor about to be registered
+ * @repr_clean:representor about to be unregistered
  * @repr_open: representor netdev open callback
  * @repr_stop: representor netdev stop callback
  * @start: start application logic
@@ -109,6 +111,9 @@ struct nfp_app_type {
int (*vnic_init)(struct nfp_app *app, struct nfp_net *nn);
void (*vnic_clean)(struct nfp_app *app, struct nfp_net *nn);
 
+   int (*repr_init)(struct nfp_app *app, struct net_device *netdev);
+   void (*repr_clean)(struct nfp_app *app, struct net_device *netdev);
+
int (*repr_open)(struct nfp_app *app, struct nfp_repr *repr);
int (*repr_stop)(struct nfp_app *app, struct nfp_repr *repr);
 
@@ -212,6 +217,21 @@ static inline int nfp_app_repr_stop(struct nfp_app *app, 
struct nfp_repr *repr)
return app->type->repr_stop(app, repr);
 }
 
+static inline int
+nfp_app_repr_init(struct nfp_app *app, struct net_device *netdev)
+{
+   if

[PATCH net 5/5] nfp: remove false positive offloads in flower vxlan

2017-11-16 Thread Jakub Kicinski

From: John Hurley 

Pass information to the match offload on whether or not the repr is the
ingress or egress dev. Only accept tunnel matches if repr is the egress
dev.

This means rules such as the following are successfully offloaded:
tc .. add dev vxlan0 .. enc_dst_port 4789 .. action redirect dev nfp_p0

While rules such as the following are rejected:
tc .. add dev nfp_p0 .. enc_dst_port 4789 .. action redirect dev vxlan0

Also reject non tunnel flows that are offloaded to an egress dev.
Non tunnel matches assume that the offload dev is the ingress port and
offload a match accordingly.

Fixes: 611aec101ab7 ("nfp: compile flower vxlan tunnel metadata match fields")
Signed-off-by: John Hurley 
Reviewed-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/flower/offload.c| 32 +-
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index a0193e0c24a0..f5d73b83dcc2 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -131,7 +131,8 @@ static bool nfp_flower_check_higher_than_mac(struct 
tc_cls_flower_offload *f)
 
 static int
 nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
-   struct tc_cls_flower_offload *flow)
+   struct tc_cls_flower_offload *flow,
+   bool egress)
 {
struct flow_dissector_key_basic *mask_basic = NULL;
struct flow_dissector_key_basic *key_basic = NULL;
@@ -167,6 +168,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls 
*ret_key_ls,
skb_flow_dissector_target(flow->dissector,
  
FLOW_DISSECTOR_KEY_ENC_CONTROL,
  flow->key);
+   if (!egress)
+   return -EOPNOTSUPP;
+
if (mask_enc_ctl->addr_type != 0xffff ||
enc_ctl->addr_type != FLOW_DISSECTOR_KEY_IPV4_ADDRS)
return -EOPNOTSUPP;
@@ -194,6 +198,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls 
*ret_key_ls,
 
key_layer |= NFP_FLOWER_LAYER_VXLAN;
key_size += sizeof(struct nfp_flower_vxlan);
+   } else if (egress) {
+   /* Reject non tunnel matches offloaded to egress repr. */
+   return -EOPNOTSUPP;
}
 
if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
@@ -315,7 +322,7 @@ nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer)
  */
 static int
 nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
-  struct tc_cls_flower_offload *flow)
+  struct tc_cls_flower_offload *flow, bool egress)
 {
struct nfp_flower_priv *priv = app->priv;
struct nfp_fl_payload *flow_pay;
@@ -326,7 +333,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct 
net_device *netdev,
if (!key_layer)
return -ENOMEM;
 
-   err = nfp_flower_calculate_key_layers(key_layer, flow);
+   err = nfp_flower_calculate_key_layers(key_layer, flow, egress);
if (err)
goto err_free_key_ls;
 
@@ -447,7 +454,7 @@ nfp_flower_get_stats(struct nfp_app *app, struct 
tc_cls_flower_offload *flow)
 
 static int
 nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev,
-   struct tc_cls_flower_offload *flower)
+   struct tc_cls_flower_offload *flower, bool egress)
 {
if (!eth_proto_is_802_3(flower->common.protocol) ||
flower->common.chain_index)
@@ -455,7 +462,7 @@ nfp_flower_repr_offload(struct nfp_app *app, struct 
net_device *netdev,
 
switch (flower->command) {
case TC_CLSFLOWER_REPLACE:
-   return nfp_flower_add_offload(app, netdev, flower);
+   return nfp_flower_add_offload(app, netdev, flower, egress);
case TC_CLSFLOWER_DESTROY:
return nfp_flower_del_offload(app, netdev, flower);
case TC_CLSFLOWER_STATS:
@@ -468,7 +475,18 @@ nfp_flower_repr_offload(struct nfp_app *app, struct 
net_device *netdev,
 int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
  void *cb_priv)
 {
-   return -EINVAL;
+   struct nfp_repr *repr = cb_priv;
+
+   if (!tc_can_offload(repr->netdev))
+   return -EOPNOTSUPP;
+
+   switch (type) {
+   case TC_SETUP_CLSFLOWER:
+   return nfp_flower_repr_offload(repr->app, repr->netdev,
+  type_data, true);
+   default:
+   return -EOPNOTSUPP;
+   }
 }
 
 static int nfp_flower_setup_tc_block_cb(enum tc_setup_type type,
@@

[PATCH net 3/5] nfp: inherit the max_mtu from the PF netdev

2017-11-16 Thread Jakub Kicinski

From: Dirk van der Merwe 

The PF netdev is used for data transfer for reprs, so reprs inherit the
maximum MTU settings of the PF netdev.

Fixes: 5de73ee46704 ("nfp: general representor implementation")
Signed-off-by: Dirk van der Merwe 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 1bce8c131bb9..fa052a929170 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -297,6 +297,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device 
*netdev,
netdev->netdev_ops = &nfp_repr_netdev_ops;
netdev->ethtool_ops = &nfp_port_ethtool_ops;
 
+   netdev->max_mtu = pf_netdev->max_mtu;
+
SWITCHDEV_SET_OPS(netdev, &nfp_port_switchdev_ops);
 
if (nfp_app_has_tc(app)) {
-- 
2.14.1

[PATCH net 1/5] nfp: fix flower offload metadata flag usage

2017-11-16 Thread Jakub Kicinski

From: Pieter Jansen van Vuuren 

Hardware has no notion of new or last mask id, instead it makes use of the
message type (i.e. add flow or del flow) in combination with a single bit
in metadata flags to determine when to add or delete a mask id. Previously
we made use of the new or last flags to indicate that a new mask should be
allocated or deallocated, respectively. This incorrect behaviour is fixed
by making use of a single bit in metadata flags to indicate mask allocation or
deallocation.

Fixes: 43f84b72c50d ("nfp: add metadata to each flow offload")
Signed-off-by: Pieter Jansen van Vuuren 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/main.h | 3 +--
 drivers/net/ethernet/netronome/nfp/flower/metadata.c | 7 +--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h 
b/drivers/net/ethernet/netronome/nfp/flower/main.h
index c90e72b7ff5a..a69ea62e9c9c 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -52,8 +52,7 @@ struct nfp_app;
 #define NFP_FLOWER_MASK_ELEMENT_RS 1
 #define NFP_FLOWER_MASK_HASH_BITS  10
 
-#define NFP_FL_META_FLAG_NEW_MASK  128
-#define NFP_FL_META_FLAG_LAST_MASK 1
+#define NFP_FL_META_FLAG_MANAGE_MASK   BIT(7)
 
 #define NFP_FL_MASK_REUSE_TIME_NS  4
 #define NFP_FL_MASK_ID_LOCATION1
diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c 
b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
index 193520ef23f0..db977cf8e933 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
@@ -282,7 +282,7 @@ nfp_check_mask_add(struct nfp_app *app, char *mask_data, 
u32 mask_len,
id = nfp_add_mask_table(app, mask_data, mask_len);
if (id < 0)
return false;
-   *meta_flags |= NFP_FL_META_FLAG_NEW_MASK;
+   *meta_flags |= NFP_FL_META_FLAG_MANAGE_MASK;
}
*mask_id = id;
 
@@ -299,6 +299,9 @@ nfp_check_mask_remove(struct nfp_app *app, char *mask_data, 
u32 mask_len,
if (!mask_entry)
return false;
 
+   if (meta_flags)
+   *meta_flags &= ~NFP_FL_META_FLAG_MANAGE_MASK;
+
*mask_id = mask_entry->mask_id;
mask_entry->ref_cnt--;
if (!mask_entry->ref_cnt) {
@@ -306,7 +309,7 @@ nfp_check_mask_remove(struct nfp_app *app, char *mask_data, 
u32 mask_len,
nfp_release_mask_id(app, *mask_id);
kfree(mask_entry);
if (meta_flags)
-   *meta_flags |= NFP_FL_META_FLAG_LAST_MASK;
+   *meta_flags |= NFP_FL_META_FLAG_MANAGE_MASK;
}
 
return true;
-- 
2.14.1

[PATCH net 2/5] nfp: fix vlan receive MAC statistics typo

2017-11-16 Thread Jakub Kicinski

From: Pieter Jansen van Vuuren 

Correct typo in vlan receive MAC stats. Previously the MAC statistics
reported in ethtool for vlan receive contained a typo resulting in ethtool
reporting rx_vlan_reveive_ok instead of rx_vlan_received_ok.

Fixes: a5950182c00e ("nfp: map mac_stats and vf_cfg BARs")
Fixes: 098ce840c9ef ("nfp: report MAC statistics in ethtool")
Reported-by: Brendan Galloway 
Signed-off-by: Pieter Jansen van Vuuren 
Reviewed-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 +-
 drivers/net/ethernet/netronome/nfp/nfp_port.h| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 60c8d733a37d..2801ecd09eab 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -104,7 +104,7 @@ static const struct nfp_et_stat nfp_mac_et_stats[] = {
{ "rx_frame_too_long_errors",
NFP_MAC_STATS_RX_FRAME_TOO_LONG_ERRORS, },
{ "rx_range_length_errors", NFP_MAC_STATS_RX_RANGE_LENGTH_ERRORS, },
-   { "rx_vlan_reveive_ok", NFP_MAC_STATS_RX_VLAN_REVEIVE_OK, },
+   { "rx_vlan_received_ok",NFP_MAC_STATS_RX_VLAN_RECEIVED_OK, },
{ "rx_errors",  NFP_MAC_STATS_RX_IN_ERRORS, },
{ "rx_broadcast_pkts",  NFP_MAC_STATS_RX_IN_BROADCAST_PKTS, },
{ "rx_drop_events", NFP_MAC_STATS_RX_DROP_EVENTS, },
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h 
b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 51dcb9c603ee..21bd4aa32646 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -157,7 +157,7 @@ void nfp_devlink_port_unregister(struct nfp_port *port);
/* unused 0x008 */
 #define NFP_MAC_STATS_RX_FRAME_TOO_LONG_ERRORS (NFP_MAC_STATS_BASE + 
0x010)
 #define NFP_MAC_STATS_RX_RANGE_LENGTH_ERRORS   (NFP_MAC_STATS_BASE + 
0x018)
-#define NFP_MAC_STATS_RX_VLAN_REVEIVE_OK   (NFP_MAC_STATS_BASE + 
0x020)
+#define NFP_MAC_STATS_RX_VLAN_RECEIVED_OK  (NFP_MAC_STATS_BASE + 
0x020)
 #define NFP_MAC_STATS_RX_IN_ERRORS (NFP_MAC_STATS_BASE + 
0x028)
 #define NFP_MAC_STATS_RX_IN_BROADCAST_PKTS (NFP_MAC_STATS_BASE + 
0x030)
 #define NFP_MAC_STATS_RX_DROP_EVENTS   (NFP_MAC_STATS_BASE + 
0x038)
-- 
2.14.1

Re: [PATCH iproute2 v2] Add "show" subcommand to "ip fou"

2017-11-16 Thread Stephen Hemminger

On Thu, 16 Nov 2017 16:53:22 -0800
Greg Greenway  wrote:

> Sample output:
> 
> $ sudo ./ip/ip fou add port 111 ipproto 11
> $ sudo ./ip/ip fou add port 222 ipproto 22 -6
> $ ./ip/ip fou show
> port 222 ipproto 22 -6
> port 111 ipproto 11
> 
> Signed-off-by: Greg Greenway 

Applied, thanks. Sorry for the annoying procedural problem.

[PATCH iproute2 v2] Add "show" subcommand to "ip fou"

2017-11-16 Thread Greg Greenway

Sample output:

$ sudo ./ip/ip fou add port 111 ipproto 11
$ sudo ./ip/ip fou add port 222 ipproto 22 -6
$ ./ip/ip fou show
port 222 ipproto 22 -6
port 111 ipproto 11

Signed-off-by: Greg Greenway 
---
 ip/ipfou.c | 60 
 1 file changed, 60 insertions(+)

diff --git a/ip/ipfou.c b/ip/ipfou.c
index 00dbe15..ecbaf11 100644
--- a/ip/ipfou.c
+++ b/ip/ipfou.c
@@ -28,6 +28,7 @@ static void usage(void)
fprintf(stderr, "Usage: ip fou add port PORT "
"{ ipproto PROTO  | gue } [ -6 ]\n");
fprintf(stderr, "   ip fou del port PORT [ -6 ]\n");
+   fprintf(stderr, "   ip fou show\n");
fprintf(stderr, "\n");
fprintf(stderr, "Where: PROTO { ipproto-name | 1..255 }\n");
fprintf(stderr, "   PORT { 1..65535 }\n");
@@ -134,6 +135,63 @@ static int do_del(int argc, char **argv)
return 0;
 }
 
+static int print_fou_mapping(const struct sockaddr_nl *who,
+struct nlmsghdr *n, void *arg)
+{
+   FILE *fp = (FILE *)arg;
+   struct genlmsghdr *ghdr;
+   struct rtattr *tb[FOU_ATTR_MAX + 1];
+   int len = n->nlmsg_len;
+   unsigned family;
+
+   if (n->nlmsg_type != genl_family)
+   return 0;
+
+   len -= NLMSG_LENGTH(GENL_HDRLEN);
+   if (len < 0)
+   return -1;
+
+   ghdr = NLMSG_DATA(n);
+   parse_rtattr(tb, FOU_ATTR_MAX, (void *) ghdr + GENL_HDRLEN, len);
+
+   if (tb[FOU_ATTR_PORT])
+   fprintf(fp, "port %u", 
ntohs(rta_getattr_u16(tb[FOU_ATTR_PORT])));
+   if (tb[FOU_ATTR_TYPE] && rta_getattr_u8(tb[FOU_ATTR_TYPE]) == 
FOU_ENCAP_GUE)
+   fprintf(fp, " gue");
+   else if (tb[FOU_ATTR_IPPROTO])
+   fprintf(fp, " ipproto %u", 
rta_getattr_u8(tb[FOU_ATTR_IPPROTO]));
+   if (tb[FOU_ATTR_AF]) {
+   family = rta_getattr_u8(tb[FOU_ATTR_AF]);
+   if (family == AF_INET6)
+   fprintf(fp, " -6");
+   }
+   fprintf(fp, "\n");
+
+   return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+   FOU_REQUEST(req, 4096, FOU_CMD_GET, NLM_F_REQUEST | NLM_F_DUMP);
+
+   if (argc > 0) {
+   fprintf(stderr, "\"ip fou show\" does not take any 
arguments.\n");
+   return -1;
+   }
+
+   if (rtnl_send(&genl_rth, &req.n, req.n.nlmsg_len) < 0) {
+   perror("Cannot send show request");
+   exit(1);
+   }
+
+   if (rtnl_dump_filter(&genl_rth, print_fou_mapping, stdout) < 0) {
+   fprintf(stderr, "Dump terminated\n");
+   return 1;
+   }
+
+   return 0;
+}
+
 int do_ipfou(int argc, char **argv)
 {
if (argc < 1)
@@ -149,6 +207,8 @@ int do_ipfou(int argc, char **argv)
return do_add(argc-1, argv+1);
if (matches(*argv, "delete") == 0)
return do_del(argc-1, argv+1);
+   if (matches(*argv, "show") == 0)
+   return do_show(argc-1, argv+1);
fprintf(stderr, "Command \"%s\" is unknown, try \"ip fou help\".\n", 
*argv);
exit(-1);
 }
-- 
2.7.4

iproute2: make ip route list to search by metric too

2017-11-16 Thread Alexander Zubkov

Hello all,

Currently routes in the Linux routing table have these "key" fields:
prefix, tos, table, metric (as I know). I.e. we cannot have two
different routes with the same set of these fields. And "ip route list"
command can be provided with all but one of those fields. We cannot
pass metric to it and this is inconvenient. I ask if this behaviour
can be changed by someone. We can even use "secondary" fields, for
example type, dev or via, but not metric unfortunately.
Sorry, I can not provide patches. I have written code long time ago. I
tried to trace it, but as I see it parses arguments and fills some
structures. And then my tries to understand failed.
I opened the bug: https://bugzilla.kernel.org/show_bug.cgi?id=197897,
but I was pointed out that this mailing list is a better place for
this question.

--
Alexander Zubkov

Re: [PATCH iproute2] Add "show" subcommand to "ip fou"

2017-11-16 Thread Stephen Hemminger

On Fri, 03 Nov 2017 10:19:22 -0700
Greg Greenway  wrote:

> On Nov 1, 2017, at 2:03 PM, Stephen Hemminger  
> wrote:
> > 
> > On Tue, 31 Oct 2017 13:00:47 -0700
> > Greg Greenway  wrote:
> >   
> >> +  if (tb[FOU_ATTR_AF]) {
> >> +  family = rta_getattr_u8(tb[FOU_ATTR_AF]);
> >> +  if (family == AF_INET)
> >> +  family_str = "AF_INET";
> >> +  else if (family == AF_INET6)
> >> +  family_str = "AF_INET6";
> >> +  else
> >> +  family_str = "unknown";
> >> +  fprintf(fp, "af %s ", family_str);  
> > 
> > The unwritten rule for ip commands is that the show function
> > must format the output with same command syntax as the other commands 
> > set/add/delete.
> > Since there is no "af AF_INET" option to ip fou, this breaks that 
> > convention.
> > Either ignore the address family, change the add command, or output with 
> > same
> > syntax (-6); preferably the latter.  
> 
> That makes sense.  Here's a corrected version.  It also avoids a 
> trailing-space in the output.

Yes, your followup looks correct but since it didn't follow the mailing list
patch protocol it was not picked up and managed by patchwork.
 https://patchwork.ozlabs.org/patch/832717/

You need to post the patch as new patch (ie not a followup) with the "v2" 
designation
in order to get it correctly picked up and managed by patchwork.

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Stephen Hemminger

On Thu, 16 Nov 2017 21:21:55 +0100
Vincent Bernat  wrote:

>  ❦ 16 novembre 2017 20:23 +0100, Andrew Lunn  :
> 
> > struct net_bridge_fdb_entry is 40 bytes.
> >
> > My WiFi access point which is also a 5 port bridge, currently has 97MB
> > free RAM. That is space for about 2.5M FDB entries. So even Roopa's
> > 128K is not really a problem, in terms of memory.  
> 
> I am also interested in Sarah's patch because we can now have bridge
> with many ports through VXLAN. The FDB can be replicated to an external
> daemon with BGP and the cost of each additional MAC address is therefore
> higher than just a few bytes. It seems simpler to implement a limiting
> policy early (at the port or bridge level).
> 
> Also, this is a pretty standard limit to have for a bridge (switchport
> port-security maximum on Cisco, set interface X mac-limit on
> Juniper). And it's not something easy to do with ebtables.

I want an optional limit per port, it makes a lot of sense.
If for no other reason than that huge hash tables are a performance problem.

There is a bigger question about which fdb to evict but just dropping the
new one seems to be easiest and as good as any other solution.

Re: [iproute2 1/1] tipc: change family attribute from u32 to u16

2017-11-16 Thread Stephen Hemminger

On Wed, 15 Nov 2017 17:25:44 +0100
Jon Maloy  wrote:

> commit 28033ae4e0f ("net: netlink: Update attr validation to require
> exact length for some types") introduces a stricter control on attributes
> of type NLA_U* and NLA_S*.
> 
> Since the tipc tool is sending a family attribute of u32 instead of as
> expected u16 the tool is now effectively broken.
> 
> We fix this by changing the type of the said attribute.
> 
> Signed-off-by: Jon Maloy 

Applied.

I noticed devlink has the same problem!
devlink/mnlg.c: mnl_attr_put_u32(nlh, CTRL_ATTR_FAMILY_ID, nlg->id);

Re: GRO disabled with IPv4 options

2017-11-16 Thread Tom Herbert

On Thu, Nov 16, 2017 at 1:40 PM, Herbert Xu  wrote:
> On Thu, Nov 16, 2017 at 04:12:43PM +0100, Cristian Klein wrote:
>>
>> Does somebody know the rationale for this? Is it because IPv4
>> options are rarely used, hence implementing GRO in that case does
>> not pay off or are there some caveats? Specifically would it make
>
> Precisely.  GRO is about optimising for the common case.  At the
> time there was no impetus to support IP options.
>
>> sense to do GRO when the IPv4 options are byte-identical in
>> consecutive packets?
>
> Yes there is no reason why we can't do this.  As long as it doesn't
> penalise the non-IP-option case too much.
>
Of course it would also be nice to have GRO support for various IPv6
extension headers, at this point we're more likely to see those rather
than IP options in real deployment!

Tom

> Cheers,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: GRO disabled with IPv4 options

2017-11-16 Thread Eric Dumazet

On Thu, 2017-11-16 at 16:12 +0100, Cristian Klein wrote:
> [CC-ing Herbert Xu, who is to 'git blame' for the code in question. :)]
> 
> Dear all,
> 
> We are working on a research prototype which, among others, adds a new 
> IPv4 option. During testing we noticed that the packets captured by 
> tcpdump shrank from 10s of KBs to the MTU, which indicates that Generic 
> Receive Offload (GRO) got disabled.
> 
> Upon further investigation, we found the following line in 
> `inet_gro_receive`:
> 
>   if (*(u8 *)iph != 0x45)
>   goto out_unlock;
> 
> in plain English, don't do GRO if any IPv4 options are present.
> 
> Does somebody know the rationale for this? Is it because IPv4 options 
> are rarely used, hence implementing GRO in that case does not pay off or 
> are there some caveats? Specifically would it make sense to do GRO when 
> the IPv4 options are byte-identical in consecutive packets?

I guess nobody really uses IPV4 options, and you are the first caring
enough ;)

Re: [PATCH v3 2/2] sock: Move the socket inuse to namespace.

2017-11-16 Thread Tonghao Zhang

On Fri, Nov 17, 2017 at 4:20 AM, Cong Wang  wrote:
> On Wed, Nov 15, 2017 at 7:36 AM, Tonghao Zhang  
> wrote:
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index b899d8669388..f01ed0b41bde 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -145,6 +145,10 @@
>>  static DEFINE_MUTEX(proto_list_mutex);
>>  static LIST_HEAD(proto_list);
>>
>> +#ifdef CONFIG_PROC_FS
>> +static void sock_inuse_add(struct net *net, int val);
>> +#endif
>> +
>>  /**
>>   * sk_ns_capable - General socket capability test
>>   * @sk: Socket to use a capability on or through
>> @@ -1536,6 +1540,7 @@ struct sock *sk_alloc(struct net *net, int family, 
>> gfp_t priority,
>> if (likely(sk->sk_net_refcnt))
>> get_net(net);
>> sock_net_set(sk, net);
>> +   sock_inuse_add(net, 1);
>
> You don't need to define a nop for sock_inuse_add() in
> !CONFIG_PROC_FS case?
Yes, we should. But CONFIG_PROC_FS can't be configured in 'make menuconfig',
so !CONFIG_PROC_FS is a rare case. That is why I don't check it there, just
like the other counter, sock_prot_inuse_add.
A patch will be sent to fix the !CONFIG_PROC_FS case, and v4 will be sent
too. Thanks a lot, Cong.

[PATCH] qed: fix unnecessary call to memset cocci warnings

2017-11-16 Thread Vasyl Gomonovych

Use kzalloc rather than kmalloc followed by memset with 0

drivers/net/ethernet/qlogic/qed/qed_dcbx.c:1280:13-20: WARNING:
kzalloc should be used for dcbx_info, instead of kmalloc/memset
Generated by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci

Signed-off-by: Vasyl Gomonovych 
---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c 
b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index 8f6ccc0c39e5..cc9e0dfcee48 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -1277,11 +1277,10 @@ static struct qed_dcbx_get *qed_dcbnl_get_dcbx(struct 
qed_hwfn *hwfn,
 {
struct qed_dcbx_get *dcbx_info;
 
-   dcbx_info = kmalloc(sizeof(*dcbx_info), GFP_ATOMIC);
+   dcbx_info = kzalloc(sizeof(*dcbx_info), GFP_ATOMIC);
if (!dcbx_info)
return NULL;
 
-   memset(dcbx_info, 0, sizeof(*dcbx_info));
if (qed_dcbx_query_params(hwfn, dcbx_info, type)) {
kfree(dcbx_info);
return NULL;
-- 
1.9.1

Re: GRO disabled with IPv4 options

2017-11-16 Thread Herbert Xu

On Thu, Nov 16, 2017 at 04:12:43PM +0100, Cristian Klein wrote:
>
> Does somebody know the rationale for this? Is it because IPv4
> options are rarely used, hence implementing GRO in that case does
> not pay off or are there some caveats? Specifically would it make

Precisely.  GRO is about optimising for the common case.  At the
time there was no impetus to support IP options.

> sense to do GRO when the IPv4 options are byte-identical in
> consecutive packets?

Yes there is no reason why we can't do this.  As long as it doesn't
penalise the non-IP-option case too much.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH] qed: fix unnecessary call to memset cocci warnings

2017-11-16 Thread Vasyl Gomonovych

Sorry.

Re: [PATCH iproute2] Add "show" subcommand to "ip fou"

2017-11-16 Thread Greg Greenway


> On Nov 3, 2017, at 10:25 AM, Tom Herbert  wrote:
> 
> On Fri, Nov 3, 2017 at 10:19 AM, Greg Greenway  wrote:
>> On Nov 1, 2017, at 2:03 PM, Stephen Hemminger  
>> wrote:
>>> 
>>> On Tue, 31 Oct 2017 13:00:47 -0700
>>> Greg Greenway  wrote:
>>> 
 +if (tb[FOU_ATTR_AF]) {
 +family = rta_getattr_u8(tb[FOU_ATTR_AF]);
 +if (family == AF_INET)
 +family_str = "AF_INET";
 +else if (family == AF_INET6)
 +family_str = "AF_INET6";
 +else
 +family_str = "unknown";
 +fprintf(fp, "af %s ", family_str);
>>> 
>>> The unwritten rule for ip commands is that the show function
>>> must format the output with same command syntax as the other commands 
>>> set/add/delete.
>>> Since there is no "af AF_INET" option to ip fou, this breaks that 
>>> convention.
>>> Either ignore the address family, change the add command, or output with 
>>> same
>>> syntax (-6); preferably the latter.
>> 
>> That makes sense.  Here's a corrected version.  It also avoids a 
>> trailing-space in the output.
>> 
>> From: Greg Greenway 
>> Date: Tue, 31 Oct 2017 12:47:35 -0700
>> Subject: [PATCH] Add "show" subcommand to "ip fou".
>> 
>> Sample output:
>> 
>> $ sudo ./ip/ip fou add port 111 ipproto 11
>> $ sudo ./ip/ip fou add port 222 ipproto 22 -6
>> $ ./ip/ip fou show
>> port 222 ipproto 22 -6
>> port 111 ipproto 11
>> 
>> Signed-off-by: Greg Greenway 
>> ---
>> ip/ipfou.c | 60 
>> 1 file changed, 60 insertions(+)
>> 
>> diff --git a/ip/ipfou.c b/ip/ipfou.c
>> index 00dbe15..ecbaf11 100644
>> --- a/ip/ipfou.c
>> +++ b/ip/ipfou.c
>> @@ -28,6 +28,7 @@ static void usage(void)
>>fprintf(stderr, "Usage: ip fou add port PORT "
>>"{ ipproto PROTO  | gue } [ -6 ]\n");
>>fprintf(stderr, "   ip fou del port PORT [ -6 ]\n");
>> +   fprintf(stderr, "   ip fou show\n");
>>fprintf(stderr, "\n");
>>fprintf(stderr, "Where: PROTO { ipproto-name | 1..255 }\n");
>>fprintf(stderr, "   PORT { 1..65535 }\n");
>> @@ -134,6 +135,63 @@ static int do_del(int argc, char **argv)
>>return 0;
>> }
>> 
>> +static int print_fou_mapping(const struct sockaddr_nl *who,
>> +struct nlmsghdr *n, void *arg)
>> +{
>> +   FILE *fp = (FILE *)arg;
>> +   struct genlmsghdr *ghdr;
>> +   struct rtattr *tb[FOU_ATTR_MAX + 1];
>> +   int len = n->nlmsg_len;
>> +   unsigned family;
>> +
>> +   if (n->nlmsg_type != genl_family)
>> +   return 0;
>> +
>> +   len -= NLMSG_LENGTH(GENL_HDRLEN);
>> +   if (len < 0)
>> +   return -1;
>> +
>> +   ghdr = NLMSG_DATA(n);
>> +   parse_rtattr(tb, FOU_ATTR_MAX, (void *) ghdr + GENL_HDRLEN, len);
>> +
>> +   if (tb[FOU_ATTR_PORT])
>> +   fprintf(fp, "port %u", 
>> ntohs(rta_getattr_u16(tb[FOU_ATTR_PORT])));
>> +   if (tb[FOU_ATTR_TYPE] && rta_getattr_u8(tb[FOU_ATTR_TYPE]) == 
>> FOU_ENCAP_GUE)
>> +   fprintf(fp, " gue");
>> +   else if (tb[FOU_ATTR_IPPROTO])
>> +   fprintf(fp, " ipproto %u", 
>> rta_getattr_u8(tb[FOU_ATTR_IPPROTO]));
>> +   if (tb[FOU_ATTR_AF]) {
>> +   family = rta_getattr_u8(tb[FOU_ATTR_AF]);
>> +   if (family == AF_INET6)
>> +   fprintf(fp, " -6");
>> +   }
>> +   fprintf(fp, "\n");
>> +
>> +   return 0;
>> +}
>> +
>> +static int do_show(int argc, char **argv)
>> +{
>> +   FOU_REQUEST(req, 4096, FOU_CMD_GET, NLM_F_REQUEST | NLM_F_DUMP);
>> +
>> +   if (argc > 0) {
>> +   fprintf(stderr, "\"ip fou show\" does not take any 
>> arguments.\n");
>> +   return -1;
>> +   }
>> +
>> +   if (rtnl_send(&genl_rth, &req.n, req.n.nlmsg_len) < 0) {
>> +   perror("Cannot send show request");
>> +   exit(1);
>> +   }
>> +
>> +   if (rtnl_dump_filter(&genl_rth, print_fou_mapping, stdout) < 0) {
>> +   fprintf(stderr, "Dump terminated\n");
>> +   return 1;
>> +   }
>> +
>> +   return 0;
>> +}
>> +
>> int do_ipfou(int argc, char **argv)
>> {
>>if (argc < 1)
>> @@ -149,6 +207,8 @@ int do_ipfou(int argc, char **argv)
>>return do_add(argc-1, argv+1);
>>if (matches(*argv, "delete") == 0)
>>return do_del(argc-1, argv+1);
>> +   if (matches(*argv, "show") == 0)
>> +   return do_show(argc-1, argv+1);
>>fprintf(stderr, "Command \"%s\" is unknown, try \"ip fou help\".\n", 
>> *argv);
>>exit(-1);
>> }
>> --
>> 2.7.4
>> 
> Acked-by: Tom Herbert 

Are there any other issues/concerns, or anything else I need to do for this 
patch to be accepted?

Thanks,
Greg

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Sarah Newman

On 11/16/2017 11:36 AM, Nikolay Aleksandrov wrote:
> On 16 November 2017 21:23:25 EET, Andrew Lunn  wrote:
>>> Linux bridges can also be used in small embedded devices. With no
>> limit,
>>> the likely result from those devices being attacked is the device
>> gets
>>> thrown away for being unreliable.
>>
>> Hi Sarah
>>
>> Just to get a gut feeling...
>>
>> struct net_bridge_fdb_entry is 40 bytes.
>>
>> My WiFi access point which is also a 5 port bridge, currently has 97MB
>> free RAM. That is space for about 2.5M FDB entries. So even Roopa's
>> 128K is not really a problem, in terms of memory.

The recommendation was a default maximum of ~4B entries rather than 128k or 
256k entries.

2.5M entries over a 300 second aging period is ~8.3kpps.
>>> Maybe what's needed is two thresholds, one for warning and one for
>> enforcement.
>>> The warning limit would need to be low enough that the information
>> had a good chance
>>> of being logged before the system was under too much load to be able
>> to convey
>>> that information. The enforcement limit could be left as default
>> inactive until
>>> shown that it needed to be otherwise.
>>
>> What exactly is the problem here? Does the DoS exhaust memory, or does
>> the hashing algorithm not scale?

My personal observation was 100% CPU usage, not memory exhaustion. Others have 
documented memory exhaustion.

> 
> Just a note - when net-next opens I'll send patches
> which move the fdb to a resizeable hashtable that scales nicely even with 
> hundreds of thousands of entries so only the memory issue will remain.

Thank you.

I believe that under attack, the number of entries could exceed hundreds of 
thousands when accumulated over the default aging time. Perhaps it would
still make sense to support a hard limit, even if it is quite high by default?

> 
>>
>> It is more work, but the table could be more closely tied to the
>> memory management code. When memory is getting low, callbacks are made
>> asking to free up memory. Register such a callback and throw away part
>> of the table when memory is getting low. There is then no need to
>> limit the size, but print a rate limited warning when asked to reduce
>> the size.

That sounds reasonable, though I think it would only trigger in the small 
embedded devices before CPU usage became an issue.

--Sarah

Re: linux-next: build warning after merge of the netfilter-next tree

2017-11-16 Thread Stephen Rothwell

Hi Pablo,

On Thu, 16 Nov 2017 15:18:00 +0100 Pablo Neira Ayuso  
wrote:
>
> The patch is already at davem's tree. It's flying there.

Excellent, thanks.

-- 
Cheers,
Stephen Rothwell

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Vincent Bernat

 ❦ 16 novembre 2017 20:23 +0100, Andrew Lunn  :

> struct net_bridge_fdb_entry is 40 bytes.
>
> My WiFi access point which is also a 5 port bridge, currently has 97MB
> free RAM. That is space for about 2.5M FDB entries. So even Roopa's
> 128K is not really a problem, in terms of memory.

I am also interested in Sarah's patch because we can now have bridge
with many ports through VXLAN. The FDB can be replicated to an external
daemon with BGP and the cost of each additional MAC address is therefore
higher than just a few bytes. It seems simpler to implement a limiting
policy early (at the port or bridge level).

Also, this is a pretty standard limit to have for a bridge (switchport
port-security maximum on Cisco, set interface X mac-limit on
Juniper). And it's not something easy to do with ebtables.
-- 
Use the good features of a language; avoid the bad ones.
- The Elements of Programming Style (Kernighan & Plauger)

Re: [PATCH v3 2/2] sock: Move the socket inuse to namespace.

2017-11-16 Thread Cong Wang

On Wed, Nov 15, 2017 at 7:36 AM, Tonghao Zhang  wrote:
> diff --git a/net/core/sock.c b/net/core/sock.c
> index b899d8669388..f01ed0b41bde 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -145,6 +145,10 @@
>  static DEFINE_MUTEX(proto_list_mutex);
>  static LIST_HEAD(proto_list);
>
> +#ifdef CONFIG_PROC_FS
> +static void sock_inuse_add(struct net *net, int val);
> +#endif
> +
>  /**
>   * sk_ns_capable - General socket capability test
>   * @sk: Socket to use a capability on or through
> @@ -1536,6 +1540,7 @@ struct sock *sk_alloc(struct net *net, int family, 
> gfp_t priority,
> if (likely(sk->sk_net_refcnt))
> get_net(net);
> sock_net_set(sk, net);
> +   sock_inuse_add(net, 1);

You don't need to define a nop for sock_inuse_add() in
!CONFIG_PROC_FS case?

Re: [PATCH] [net-next,v2] ibmvnic: fix dma_mapping_error call

2017-11-16 Thread Desnes Augusto Nunes do Rosário


First of all, I apologize for sending this patch to net-next!

Since this is a fix, it should have been sent to the regular net tree, 
which I'll do now with the proper fixes tag. My mistake!


Thanks for understanding and please discard this one.

On 11/16/2017 04:33 PM, Desnes Augusto Nunes do Rosario wrote:

This patch fixes the dma_mapping_error call to use the correct dma_addr
which is inside the ibmvnic_vpd struct. Moreover, it fixes an uninitialized
  warning for the local dma_addr.

Signed-off-by: Desnes A. Nunes do Rosario 
---
  drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 04aaacb..1dc4aef 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -849,7 +849,6 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
  {
struct device *dev = &adapter->vdev->dev;
union ibmvnic_crq crq;
-   dma_addr_t dma_addr;
int len = 0;

if (adapter->vpd->buff)
@@ -879,7 +878,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
adapter->vpd->dma_addr =
dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
   DMA_FROM_DEVICE);
-   if (dma_mapping_error(dev, dma_addr)) {
+   if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
dev_err(dev, "Could not map VPD buffer\n");
kfree(adapter->vpd->buff);
return -ENOMEM;



--
Desnes Augusto Nunes do Rosário
--

Linux Developer - IBM / Brazil
M.Sc. in Electrical and Computer Engineering - UFRN

(11) 9595-30-900
desn...@br.ibm.com

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Nikolay Aleksandrov

On 16 November 2017 21:23:25 EET, Andrew Lunn  wrote:
>> Linux bridges can also be used in small embedded devices. With no
>limit,
>> the likely result from those devices being attacked is the device
>gets
>> thrown away for being unreliable.
>
>Hi Sarah
>
>Just to get a gut feeling...
>
>struct net_bridge_fdb_entry is 40 bytes.
>
>My WiFi access point which is also a 5 port bridge, currently has 97MB
>free RAM. That is space for about 2.5M FDB entries. So even Roopa's
>128K is not really a problem, in terms of memory.
>
>> Maybe what's needed is two thresholds, one for warning and one for
>enforcement.
>> The warning limit would need to be low enough that the information
>had a good chance
>> of being logged before the system was under too much load to be able
>to convey
>> that information. The enforcement limit could be left as default
>inactive until
>> shown that it needed to be otherwise.
>
>What exactly is the problem here? Does the DoS exhaust memory, or does
>the hashing algorithm not scale?

Just a note - when net-next opens I'll send patches
which move the fdb to a resizeable hashtable that scales nicely even with 
hundreds of thousands of entries so only the memory issue will remain.

>
>It is more work, but the table could be more closely tied to the
>memory management code. When memory is getting low, callbacks are made
>asking to free up memory. Register such a callback and throw away part
>of the table when memory is getting low. There is then no need to
>limit the size, but print a rate limited warning when asked to reduce
>the size.
>
>Andrew


-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

Re: [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API

2017-11-16 Thread chetan L

On Wed, Nov 15, 2017 at 5:44 PM, David Miller  wrote:
> From: chet l 
> Date: Wed, 15 Nov 2017 14:34:32 -0800
>
>> I have not reviewed the entire patchset but I think if we could add a
>> version_hdr and then unionize the fields, it might be easier to add
>> SVM support without having to spin v5. I could be wrong though.
>
> Please, NO VERSION FIELDS!
>
> Design things properly from the start rather than using a crutch of
> being able to "adjust things later".

Agreed. If this step in tpkt_v4 is able to follow what req1/2/3 did as
part of the setsockopt(..) API then it should be ok. If it's a
different API then it will be difficult for the follow-on version(s)
to make seamless changes.

Look at tpacket_req3 for example. Since there was no hdr, I had no
option but to align the fields with tpacket_req/req2 during the setup.
I won't have access to a SMMUv3 capable ARM platform anytime soon. So
I can't actually test/write anything as of now.

Chetan

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Andrew Lunn

> Linux bridges can also be used in small embedded devices. With no limit,
> the likely result from those devices being attacked is the device gets
> thrown away for being unreliable.

Hi Sarah

Just to get a gut feeling...

struct net_bridge_fdb_entry is 40 bytes.

My WiFi access point which is also a 5 port bridge, currently has 97MB
free RAM. That is space for about 2.5M FDB entries. So even Roopa's
128K is not really a problem, in terms of memory.

> Maybe what's needed is two thresholds, one for warning and one for 
> enforcement.
> The warning limit would need to be low enough that the information had a good 
> chance
> of being logged before the system was under too much load to be able to convey
> that information. The enforcement limit could be left as default inactive 
> until
> shown that it needed to be otherwise.

What exactly is the problem here? Does the DoS exhaust memory, or does
the hashing algorithm not scale?

It is more work, but the table could be more closely tied to the
memory management code. When memory is getting low, callbacks are made
asking to free up memory. Register such a callback and throw away part
of the table when memory is getting low. There is then no need to
limit the size, but print a rate limited warning when asked to reduce
the size.

Andrew

Re: [PATCH] [net-next] ibmvnic: This patch fixes the dma_mapping_error call to use the correct dma_addr which is inside the ibmvnic_vpd struct.

2017-11-16 Thread Desnes Augusto Nunes do Rosário


Version 2 of this patch has been already sent with correct styling.

On 11/16/2017 04:28 PM, Desnes Augusto Nunes do Rosario wrote:

Signed-off-by: Desnes A. Nunes do Rosario 
---
  drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 04aaacb..1dc4aef 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -849,7 +849,6 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
  {
struct device *dev = &adapter->vdev->dev;
union ibmvnic_crq crq;
-   dma_addr_t dma_addr;
int len = 0;

if (adapter->vpd->buff)
@@ -879,7 +878,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
adapter->vpd->dma_addr =
dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
   DMA_FROM_DEVICE);
-   if (dma_mapping_error(dev, dma_addr)) {
+   if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
dev_err(dev, "Could not map VPD buffer\n");
kfree(adapter->vpd->buff);
return -ENOMEM;



--
Desnes Augusto Nunes do Rosário
--

Linux Developer - IBM / Brazil
M.Sc. in Electrical and Computer Engineering - UFRN

(11) 9595-30-900
desn...@br.ibm.com

[PULL] vhost/virtio/qemu: cleanups and fixes

2017-11-16 Thread Michael S. Tsirkin

DMA support in FW CFG had to be pushed out as it caused ltp failures -
likely a compatibility issue, and could be a hypervisor bug, but we need
to figure it out first. There's still a small chance it'll happen
shortly, then I might do another pull request just for that.

The following changes since commit bebc6082da0a9f5d47a1ea2edc099bf671058bd4:

  Linux 4.14 (2017-11-12 10:46:13 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

for you to fetch changes up to c1d0c3f623ada808904dec676da0126f5b800630:

  fw_cfg: fix the command line module name (2017-11-14 23:57:40 +0200)


virtio, vhost, qemu: bugfixes, cleanups

Fixes in qemu, vhost and virtio.

Signed-off-by: Michael S. Tsirkin 


Byungchul Park (1):
  vhost/scsi: Use safe iteration in vhost_scsi_complete_cmd_work()

Marc-André Lureau (1):
  fw_cfg: fix the command line module name

Michael S. Tsirkin (2):
  virtio_balloon: fix deadlock on OOM
  vhost: fix end of range for access_ok

Stefan Hajnoczi (1):
  vhost/vsock: fix uninitialized vhost_vsock->guest_cid

 drivers/firmware/qemu_fw_cfg.c |  8 
 drivers/vhost/scsi.c   |  4 ++--
 drivers/vhost/vhost.c  |  4 ++--
 drivers/vhost/vsock.c  |  2 ++
 drivers/virtio/virtio_balloon.c| 24 +++-
 include/linux/balloon_compaction.h | 35 ++-
 mm/balloon_compaction.c| 28 +---
 7 files changed, 84 insertions(+), 21 deletions(-)

Re: [PATCH v4] wcn36xx: Set default BTLE coexistence config

2017-11-16 Thread Bjorn Andersson

On Thu 16 Nov 00:01 PST 2017, Ramon Fried wrote:

> From: Eyal Ilsar 
> 
> If the value for the firmware configuration parameters
> BTC_STATIC_LEN_LE_BT and BTC_STATIC_LEN_LE_WLAN are not set the duty
> cycle between BT and WLAN is such that if BT (including BLE) is active
> WLAN gets 0 bandwidth. When tuning these parameters having a too high
> value for WLAN means that BLE performance degrades.
> The "sweet" point of roughly half of the maximal values was empirically
> found to achieve a balance between BLE and Wi-Fi coexistence
> performance.
> 
> Signed-off-by: Eyal Ilsar 
> Signed-off-by: Ramon Fried 

Looks good,

Acked-by: Bjorn Andersson 

Regards,
Bjorn

> ---
>  drivers/net/wireless/ath/wcn36xx/smd.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c 
> b/drivers/net/wireless/ath/wcn36xx/smd.c
> index 9c6590d5348a..6f1e741acf3e 100644
> --- a/drivers/net/wireless/ath/wcn36xx/smd.c
> +++ b/drivers/net/wireless/ath/wcn36xx/smd.c
> @@ -73,6 +73,8 @@ static struct wcn36xx_cfg_val wcn36xx_cfg_vals[] = {
>   WCN36XX_CFG_VAL(TX_PWR_CTRL_ENABLE, 1),
>   WCN36XX_CFG_VAL(ENABLE_CLOSE_LOOP, 1),
>   WCN36XX_CFG_VAL(ENABLE_LPWR_IMG_TRANSITION, 0),
> + WCN36XX_CFG_VAL(BTC_STATIC_LEN_LE_BT, 12),
> + WCN36XX_CFG_VAL(BTC_STATIC_LEN_LE_WLAN, 3),
>   WCN36XX_CFG_VAL(MAX_ASSOC_LIMIT, 10),
>   WCN36XX_CFG_VAL(ENABLE_MCC_ADAPTIVE_SCHEDULER, 0),
>  };
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>

[PATCH] [net-next,v2] ibmvnic: fix dma_mapping_error call

2017-11-16 Thread Desnes Augusto Nunes do Rosario

This patch fixes the dma_mapping_error call to use the correct dma_addr
which is inside the ibmvnic_vpd struct. Moreover, it fixes an uninitialized
 warning for the local dma_addr.

Signed-off-by: Desnes A. Nunes do Rosario 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 04aaacb..1dc4aef 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -849,7 +849,6 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 {
struct device *dev = &adapter->vdev->dev;
union ibmvnic_crq crq;
-   dma_addr_t dma_addr;
int len = 0;
 
if (adapter->vpd->buff)
@@ -879,7 +878,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
adapter->vpd->dma_addr =
dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
   DMA_FROM_DEVICE);
-   if (dma_mapping_error(dev, dma_addr)) {
+   if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
dev_err(dev, "Could not map VPD buffer\n");
kfree(adapter->vpd->buff);
return -ENOMEM;
-- 
2.9.5

[PATCH] [net-next] ibmvnic: This patch fixes the dma_mapping_error call to use the correct dma_addr which is inside the ibmvnic_vpd struct.

2017-11-16 Thread Desnes Augusto Nunes do Rosario

Signed-off-by: Desnes A. Nunes do Rosario 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 04aaacb..1dc4aef 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -849,7 +849,6 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 {
struct device *dev = &adapter->vdev->dev;
union ibmvnic_crq crq;
-   dma_addr_t dma_addr;
int len = 0;
 
if (adapter->vpd->buff)
@@ -879,7 +878,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
adapter->vpd->dma_addr =
dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
   DMA_FROM_DEVICE);
-   if (dma_mapping_error(dev, dma_addr)) {
+   if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
dev_err(dev, "Could not map VPD buffer\n");
kfree(adapter->vpd->buff);
return -ENOMEM;
-- 
2.9.5

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Sarah Newman

On 11/16/2017 01:58 AM, Willy Tarreau wrote:
> Hi Sarah,
> 
> On Thu, Nov 16, 2017 at 01:20:18AM -0800, Sarah Newman wrote:
>> I note that anyone who would run up against a too-low limit on the maximum
>> number of fdb entries would also be savvy enough to fix it in a matter of
>> minutes.
> 
> I disagree on this point. There's a huge difference between experiencing
> sudden breakage under normal conditions due to arbitrary limits being set
> and being down because of an attack. While the latter is not desirable,
> it's much more easily accepted and most often requires operations anyway.
> The former is never an option.

Yes, being down during an attack is expected, assuming you know you are
being attacked.

Linux bridges can also be used in small embedded devices. With no limit,
the likely result from those devices being attacked is the device gets
thrown away for being unreliable.

> 
> And I continue to think that the default behaviour once the limit is reached
> must not be to prevent new entries from being learned but to purge older
> ones. At least it preserves normal operations.

I'm not disagreeing.

I spent maybe a couple of hours on this patch and was hoping someone else would
find more time to spend on the problem.

> 
> But given the high CPU impact you reported for a very low load, definitely
> something needs to be done
It's nice to think so.

> 
>> They could also default the limit to U32_MAX in their particular
>> distribution if it was a configuration option.
> 
> Well, I'd say that we don't have a default limit on the socket number either
> and that it happens to be the expected behaviour. It's almost impossible to
> find a suitable limit for everyone. People dealing with regular loads never
> read docs and get caught. People working in hostile environments are always
> more careful and will ensure that their limits are properly set.

Neighbor tables for ipv4/ipv6 seem more comparable. gc_thresh3 is 1024 and
typically needs to be adjusted higher for Linux routers.

As you say, there is no default limit that suits everyone. So the question is
really who is burdened with changing the default.

There is a lot of talk of not breaking existing users. The current
implementation is demonstrably vulnerable, and since the problem is likely 
silent
there's not a good way to know how often it's actually occurred. I note the
tool to trigger it is trivially available and it's a well-known type of attack.

But I understand if there have been an insufficient known number of attacks
to change the current default situation. It could be left to user space to
make a new default if it becomes a demonstrable problem.

If user space has to change at all you could again argue for a pure user-space
solution, but I'm not sure if a pure user-space solution would always have a
chance to fix the problem before the system was brought down.

> 
>> At the moment there is not even a single log message if the problem doesn't
>> result in memory exhaustion.
> 
> This probably needs to be addressed as well
Maybe what's needed is two thresholds, one for warning and one for enforcement.
The warning limit would need to be low enough that the information had a good 
chance
of being logged before the system was under too much load to be able to convey
that information. The enforcement limit could be left as default inactive until
shown that it needed to be otherwise.

--Sarah

Re: SRIOV switchdev mode BoF minutes

2017-11-16 Thread Alexander Duyck

On Thu, Nov 16, 2017 at 9:41 AM, Or Gerlitz  wrote:
> On Wed, Nov 15, 2017 at 1:05 AM, Alexander Duyck
>  wrote:
>> On Tue, Nov 14, 2017 at 1:50 PM, Or Gerlitz  wrote:
>
>>> all dealing with the sriov e-switch as a HW switch which should
>>> be programmed
>>> by the host stack according to well known industry models that apply
>>> on physical switches, e.g
>>>
>>> 1. L2 FDB (Linux Bridge)
>>> 2. L3 FIB (Linux Routers)
>>> 3. ACLS (Linux TC)
>>>
>>> [3] is what implemented by the upstream sriov switchdev drivers, [1] and 
>>> [2] we
>>> discussed on netdev, maybe you want to play with [1] for i40e? I had a 
>>> slide on
>>> that in the BoF
>
>> So for i40e we will probably explore option 1, and possibly option 3
>> though as I said we still have to figure out what we can get the
>> firmware to actually do for us. That ends up being the ultimate
>> limitation.
>
> I think Intel/Linux/sriov wise, it would be good if you put now the
> focus on that small
> corner of the universe and show support for the new community lead
> mode by having
> one of your current drivers support that.

I am trying to focus on this area. The problem is you keep assuming
what we can and can't do in our hardware. I am not certain we can
handle the "learning" aspect of things. The biggest issue is that our
hardware was designed to be a VEPA with a filter based hairpin. It
really wasn't designed to be a switch. My concern is you may have been
misinformed about what our hardware can and cannot do. In addition
changing our firmware for the parts supported by i40e isn't that easy.
In addition there is no guarantee that we can do what is being asked
per PCIe function, it might be a global impact on the entire device.
If that were the case then it isn't an option since we can't have one
function breaking another. There are a lot of what-if scenarios that
we have to sort out, if we can even get the firmware update for this
since it was mostly locked down and in maintenance mode.

> FDB support would be great and it will help transition existing legacy
> mode users to the switchdev
> mode, b/c essentially FDBs is what each driver now configures their HW
> from within, where's if
> we manage to get a bridge to be offloaded, all what left is systemd
> script that creates the VF,
> puts the driver into switchdev mode, creates a bridge with the reps,
> and that is it!!
>
> I have presented a slide in our BoF re what does it take to support
> FDB, here it is:
>
> 1. create linux bridge (e.g.1q), assign VF and uplink rep netdevices
> to the bridge
> 2. support the switchdev FDB notifications in the HW driver

This is essentially what I hope to support with source macvlan based
port representors.

> learning: respond to SWITCHDEV_FDB_ADD_TO_DEVICE events

This requires that we see the traffic. We have to figure out if we can
actually make the CPU the default target and can then get the traffic
out of the uplink interface without horribly breaking things. It will
take time to see if we can even do it.

The problem is the CPU/PF is only the default target for traffic
coming from the uplink on our devices. Anything the VF sends will
default to the uplink unless there is a filter for it to route it
otherwise.

> aging: respond to SWITCHDEV_FDB_DEL_TO_DEVICE events (del FDB from HW)
> enhance the driver/bridge API to allows drivers provide last-use
> indications on FDB entries
>
> STP:
>
> fwd  - offload FDBs as explained above
> learning - make sure HW flow miss (slow path) goes to CPU
> discard  - add drop HW rule
>
> flooding:
>
> use SW based flooding

This is much easier said than done when you are working with a device
that was architected years before switchdev was a thing. I'll see what
we can do, but I cannot make any promises.

- Alex

Re: [PATCH] net/smc: Fix preinitialization of buf_desc in __smc_buf_create()

2017-11-16 Thread Ursula Braun



On 11/16/2017 12:22 PM, Geert Uytterhoeven wrote:
> With gcc-4.1.2:
> 
> net/smc/smc_core.c: In function ‘__smc_buf_create’:
> net/smc/smc_core.c:567: warning: ‘bufsize’ may be used uninitialized in 
> this function
> 
> Indeed, if the for-loop is never executed, bufsize is used
> uninitialized.  In addition, buf_desc is stored for later use, while it
> is still a NULL pointer.
> 
> Before, error handling was done by checking if buf_desc is non-NULL.
> The cleanup changed this to an error check, but forgot to update the
> preinitialization of buf_desc to an error pointer.
> 
> Update the preinitialization of buf_desc to fix this.
> 
> Fixes: b33982c3a6838d13 ("net/smc: cleanup function __smc_buf_create()")
> Signed-off-by: Geert Uytterhoeven 
> ---
> I don't know if this can ever happen, but the old code handled it.

The for-loop is at least executed once; thus there is no real problem.
Nevertheless the warning is ugly, and the current initialization with NULL
meaningless after the smc_buf cleanup. Therefore I add your patch to my list of
coming smc patches. Thanks!

> ---
>  net/smc/smc_core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index 2578fbd95664af84..453c54467082d93f 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -562,7 +562,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool 
> is_rmb)
>  {
>   struct smc_connection *conn = &smc->conn;
>   struct smc_link_group *lgr = conn->lgr;
> - struct smc_buf_desc *buf_desc = NULL;
> + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
>   struct list_head *buf_list;
>   int bufsize, bufsize_short;
>   int sk_buf_size;
>

Re: SRIOV switchdev mode BoF minutes

2017-11-16 Thread Or Gerlitz

On Wed, Nov 15, 2017 at 1:05 AM, Alexander Duyck
 wrote:
> On Tue, Nov 14, 2017 at 1:50 PM, Or Gerlitz  wrote:

>> all dealing with the sriov e-switch as a HW switch which should
>> be programmed
>> by the host stack according to well known industry models that apply
>> on physical switches, e.g
>>
>> 1. L2 FDB (Linux Bridge)
>> 2. L3 FIB (Linux Routers)
>> 3. ACLS (Linux TC)
>>
>> [3] is what implemented by the upstream sriov switchdev drivers, [1] and [2] 
>> we
>> discussed on netdev, maybe you want to play with [1] for i40e? I had a slide 
>> on
>> that in the BoF

> So for i40e we will probably explore option 1, and possibly option 3
> though as I said we still have to figure out what we can get the
> firmware to actually do for us. That ends up being the ultimate
> limitation.

I think Intel/Linux/sriov wise, it would be good if you put now the
focus on that small
corner of the universe and show support for the new community lead
mode by having
one of your current drivers support that.

FDB support would be great and it will help transition existing legacy
mode users to the switchdev
mode, b/c essentially FDBs is what each driver now configures their HW
from within, whereas if
we manage to get a bridge to be offloaded, all that is left is a systemd
script that creates the VF,
puts the driver into switchdev mode, creates a bridge with the reps,
and that is it!!

I have presented a slide in our BoF re what does it take to support
FDB, here it is:

1. create linux bridge (e.g.1q), assign VF and uplink rep netdevices
to the bridge
2. support the switchdev FDB notifications in the HW driver

learning: respond to SWITCHDEV_FDB_ADD_TO_DEVICE events

aging: respond to SWITCHDEV_FDB_DEL_TO_DEVICE events (del FDB from HW)
enhance the driver/bridge API to allow drivers to provide last-use
indications on FDB entries

STP:

fwd  - offload FDBs as explained above
learning - make sure HW flow miss (slow path) goes to CPU
discard  - add drop HW rule

flooding:

use SW based flooding

[PATCH] rsi: fix memory leak on buf and usb_reg_buf

2017-11-16 Thread Colin King

From: Colin Ian King 

In the cases where len is too long, the error return path fails to
kfree allocated buffers buf and usb_reg_buf.  The simplest fix is to
perform the sanity check on len before the allocations to avoid having
to do the kfree'ing in the first place.

Detected by CoverityScan, CID#1452258,1452259 ("Resource Leak")

Fixes: 59f73e2ae185 ("rsi: check length before USB read/write register")
Signed-off-by: Colin Ian King 
---
 drivers/net/wireless/rsi/rsi_91x_usb.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c 
b/drivers/net/wireless/rsi/rsi_91x_usb.c
index 08730227cd18..8f8443833348 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb.c
@@ -162,13 +162,13 @@ static int rsi_usb_reg_read(struct usb_device *usbdev,
u8 *buf;
int status = -ENOMEM;
 
+   if (len > RSI_USB_CTRL_BUF_SIZE)
+   return -EINVAL;
+
buf  = kmalloc(RSI_USB_CTRL_BUF_SIZE, GFP_KERNEL);
if (!buf)
return status;
 
-   if (len > RSI_USB_CTRL_BUF_SIZE)
-   return -EINVAL;
-
status = usb_control_msg(usbdev,
 usb_rcvctrlpipe(usbdev, 0),
 USB_VENDOR_REGISTER_READ,
@@ -207,13 +207,13 @@ static int rsi_usb_reg_write(struct usb_device *usbdev,
u8 *usb_reg_buf;
int status = -ENOMEM;
 
+   if (len > RSI_USB_CTRL_BUF_SIZE)
+   return -EINVAL;
+
usb_reg_buf  = kmalloc(RSI_USB_CTRL_BUF_SIZE, GFP_KERNEL);
if (!usb_reg_buf)
return status;
 
-   if (len > RSI_USB_CTRL_BUF_SIZE)
-   return -EINVAL;
-
usb_reg_buf[0] = (value & 0x00ff);
usb_reg_buf[1] = (value & 0xff00) >> 8;
usb_reg_buf[2] = 0x0;
-- 
2.14.1

[PATCH] net/netlabel: Add list_next_rcu() in rcu_dereference().

2017-11-16 Thread Tim Hansen

Add list_next_rcu() for fetching next list in rcu_dereference safely.

Found with sparse in linux-next tree on tag next-20171116.

Signed-off-by: Tim Hansen <devtimhan...@gmail.com>
---
 net/netlabel/netlabel_addrlist.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h
index d0f38bc..ac709f0 100644
--- a/net/netlabel/netlabel_addrlist.h
+++ b/net/netlabel/netlabel_addrlist.h
@@ -87,7 +87,7 @@ static inline struct netlbl_af4list 
*__af4list_valid_rcu(struct list_head *s,
struct list_head *i = s;
struct netlbl_af4list *n = __af4list_entry(s);
while (i != h && !n->valid) {
-   i = rcu_dereference(i->next);
+   i = rcu_dereference(list_next_rcu(i));
n = __af4list_entry(i);
}
return n;
@@ -154,7 +154,7 @@ static inline struct netlbl_af6list 
*__af6list_valid_rcu(struct list_head *s,
struct list_head *i = s;
struct netlbl_af6list *n = __af6list_entry(s);
while (i != h && !n->valid) {
-   i = rcu_dereference(i->next);
+   i = rcu_dereference(list_next_rcu(i));
n = __af6list_entry(i);
}
return n;
-- 
2.1.4

Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Stephen Hemminger

On Wed, 15 Nov 2017 22:20:23 -0800
Roopa Prabhu  wrote:

> On Wed, Nov 15, 2017 at 10:13 PM, Toshiaki Makita
>  wrote:
> > On 2017/11/16 13:54, Sarah Newman wrote:  
> >> On 11/15/2017 08:05 PM, Toshiaki Makita wrote:  
> >>> On 2017/11/16 11:25, Andrew Lunn wrote:  
> > Also what do the vendors using bridge for L2 offload to switch think?  
> 
>  The Marvell L2 switches which DSA supports have 8K FDB/MDB entries. So
>  maybe 1024 is a bit low?  
> >>>
> >>> How about U32_MAX by default since it is currently not restricted.
> >>> (assuming the field will be changed to u32 as per Stephen's feedback).
> >>>
> >>> Otherwise users may suffer from unexpected behavior change by updating
> >>> kernel?
> >>>  
> >>
> >> U32_MAX seems like much too high a default to be helpful to a typical 
> >> user. How many devices are realistically on a single bridge in the wild? 
> >> Double
> >> that seems like a reasonable default.  
> >
> > I'm suggesting the most unrealistic number to essentially disable the
> > restriction by default.
> > My understanding is that we put a priority on not to break existing
> > users even if the new restriction looks reasonable for most people.  
> 
> +1 , and yes, 1024 is very low. some of the switches we use support
> around 128K FDB entries and we have seen that number increase fairly
> quickly in newer generation switches. Default should be no limit to
> not break existing users.

New features can not break existing users.

My recommendation would be that 0 be used as a magic value to indicate
no limit and that would be the default.

Also the limit should be controllable on a per port of bridge (interface) basis.

Re: [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API

2017-11-16 Thread Jesper Dangaard Brouer

On Wed, 15 Nov 2017 14:21:38 -0800
chet l  wrote:

> One quick question:
> Any thoughts on SVM support?

What is SVM ?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH v3 3/4] net: nb8800: Move HW init to ndo_open()

2017-11-16 Thread Marc Gonzalez

On 16/11/2017 17:23, Andrew Lunn wrote:

> Maybe take a look at your memory barriers. Most accesses using the
> _relaxed() version, i.e, no barrier. And then there are specific
> barriers when needed. One could be missing.
> 
> As a quick test, drop the _relaxed. Force a barrier with each
> access. If that works, it is a clear indication you have a barrier
> problem.

That was an interesting suggestion, thanks!

Unfortunately, adding wmb() in dozens of strategic places doesn't
prevent the issue where network connectivity is lost :-(

Regards.

Re: [PATCH v3 3/4] net: nb8800: Move HW init to ndo_open()

2017-11-16 Thread Andrew Lunn

> I'm starting to think there is some kind of race condition between
> SW and HW handling of descriptors. This might also explain the
> out-of-order warnings.

Hi Marc

Maybe take a look at your memory barriers. Most accesses use the
_relaxed() version, i.e., no barrier. And then there are specific
barriers when needed. One could be missing.

As a quick test, drop the _relaxed. Force a barrier with each
access. If that works, it is a clear indication you have a barrier
problem.

Andrew

Re: Broken netlink ABI

2017-11-16 Thread David Ahern

On 11/14/17 1:24 PM, Jon Maloy wrote:
> 
> 
>> -Original Message-
>> From: netdev-ow...@vger.kernel.org [mailto:netdev-
>> ow...@vger.kernel.org] On Behalf Of David Ahern
>> Sent: Tuesday, November 14, 2017 15:18
>> To: Jon Maloy ; netdev@vger.kernel.org; Jiri
>> Pirko 
>> Cc: David Miller (da...@davemloft.net) 
>> Subject: Re: Broken netlink ABI
>>
>> On 11/14/17 1:15 PM, David Ahern wrote:
>>> On 11/14/17 12:19 PM, Jon Maloy wrote:
>>>> When I give the command:
>>>> ~$ tipc node set addr 1.1.2
>>>>
>>>> I get the following response:
>>>>
>>>> error: Numerical result out of range
>>>> Unable to get TIPC nl family id (module loaded?) error, message
>>>> initialisation failed
>>>
>>> tipc is sending a u32 for the family attribute when it should be a u16:
>>>
>>> diff --git a/tipc/msg.c b/tipc/msg.c
>>> index 22c6bb20..dc09d05048f3 100644
>>> --- a/tipc/msg.c
>>> +++ b/tipc/msg.c
>>> @@ -125,7 +125,7 @@ static int get_family(void)
>>> genl->cmd = CTRL_CMD_GETFAMILY;
>>> genl->version = 1;
>>>
>>> -   mnl_attr_put_u32(nlh, CTRL_ATTR_FAMILY_ID, GENL_ID_CTRL);
>>> +   mnl_attr_put_u16(nlh, CTRL_ATTR_FAMILY_ID, GENL_ID_CTRL);
>>> mnl_attr_put_strz(nlh, CTRL_ATTR_FAMILY_NAME,
>>> TIPC_GENL_V2_NAME);
>>>
>>> if ((err = msg_query(nlh, family_id_cb, _family)))
>>>
>>> With the above change the tipc command runs fine.
> 
> I can fix that, but that doesn't change the fact that binaries that have 
> been around and worked flawlessly for years have now all of a sudden stopped 
> working.

The command has to be broken on some platforms (big endian?); it is
sending a u32 value which is truncated to u16 by the kernel.

> Whether the user is doing right or wrong, that is for me the very definition 
> of a broken ABI, and is unacceptable.
> 
> Either you have to remove the test in your patch, or you can try to identify 
> tipc and devlink in the code and exempt those from your test.
> 

DaveM: opinions? I expected fallout like this. Should I just log a
warning telling users they are running broken commands?

Re: [RFC PATCH net-next 0/2] Configuring PFC stall prevention via ethtool

2017-11-16 Thread Andrew Lunn

> I don't like adding another ethtool_ops callback tightly tied to the
> structures passed via ioctl() but when I started to think what to
> suggest as an alternative, I started to wonder if it is really necessary
> to add a new ethtool command at all. Couldn't this be handled as
> a tunable?

I agree with Michal here.

And as he pointed out, there does not need to be a 1:1 mapping between
ethtool(1) and the kAPI. I suggest extending the existing -a option,
and have it make two system calls if needed.

Andrew

Re: [RFC PATCH net-next 0/2] Configuring PFC stall prevention via ethtool

2017-11-16 Thread Andrew Lunn

On Thu, Nov 16, 2017 at 11:17:36AM +0200, Eran Ben Elisha wrote:
> On Thu, Nov 16, 2017 at 4:44 AM, Andrew Lunn  wrote:
> >> What do other vendors support? Time? Number of pause frames sent?
> >
> > So i checked a few Marvell Switches. You can also specify a time. It
> > is a little bit more complex than that, since the units of time depend
> > on the link speed. But converting a time in ms to what the register
> > wants is possible.
> >
> > So i'm thinking rather than a poorly defined 'Auto', passing a time
> > would be better.
> >
> >   Andrew
> Hi Andrew,
> 
> We were using the term 'Auto' for few reasons.
> 1. Not confusing the user with the question of what is the correct
> value (100 ms is good? Bad?)
> 2. Allowing exposure of new mechanism in the future without user need
> to change its commands
> 3. Letting the device to decide on best approach according to its
> capabilities, link speed, etc.
> 
> Our initial thought was to expose with timeout as you suggested, but
> it felt very restrictive due to the reasons I mentioned.

I just find 'auto' to be very unclearly defined. Auto-negotiation is
well defined, it is specified in 802.3. But what does Auto mean here?
Why 8ms? Why not 42ms? or 420ms? Auto also generally means some sort
of dynamic behaviour. Make changes depending on the current
conditions. Whereas your implementation seems to be fixed at 8ms.

Does 802.3 say anything about this at all? Does it list the 8 seconds
your driver defaults to?

Thanks
Andrew

GRO disabled with IPv4 options

2017-11-16 Thread Cristian Klein


[CC-ing Herbert Xu, who is to 'git blame' for the code in question. :)]

Dear all,

We are working on a research prototype which, among others, adds a new 
IPv4 option. During testing we noticed that the packets captured by 
tcpdump shrank from 10s of KBs to the MTU, which indicates that Generic 
Receive Offload (GRO) got disabled.


Upon further investigation, we found the following line in 
`inet_gro_receive`:


if (*(u8 *)iph != 0x45)
goto out_unlock;

in plain English, don't do GRO if any IPv4 options are present.

Does somebody know the rationale for this? Is it because IPv4 options 
are rarely used, hence implementing GRO in that case does not pay off or 
are there some caveats? Specifically would it make sense to do GRO when 
the IPv4 options are byte-identical in consecutive packets?


Regards,

--
Cristian Klein, PhD
Researcher @ Umeå University
http://kleinlabs.eu

Re: linux-next: build warning after merge of the netfilter-next tree

2017-11-16 Thread Pablo Neira Ayuso

On Thu, Nov 16, 2017 at 09:46:17AM +1100, Stephen Rothwell wrote:
> Hi Pablo,
> 
> On Thu, 9 Nov 2017 00:40:14 +0100 Pablo Neira Ayuso  
> wrote:
> >
> > On Wed, Nov 08, 2017 at 07:00:52PM +1100, Stephen Rothwell wrote:
> > > 
> > > On Tue, 7 Nov 2017 11:02:48 +1100 Stephen Rothwell 
> > >  wrote:  
> > > >
> > > > After merging the netfilter-next tree, today's linux-next build (powerpc
> > > > ppc64_defconfig) produced this warning:
> > > > 
> > > > net/netfilter/nf_conntrack_netlink.c:536:15: warning: 
> > > > 'ctnetlink_proto_size' defined but not used [-Wunused-function]
> > > >  static size_t ctnetlink_proto_size(const struct nf_conn *ct)
> > > >^
> > > > 
> > > > Introduced by commit
> > > > 
> > > >   5caaed151a68 ("netfilter: conntrack: don't cache nlattr_tuple_size 
> > > > result in nla_size")  
> > > 
> > > I assume that this warning will now be in the net-next tree ...  
> > 
> > It's my fault, I'll fix this in my next batch, sorry for the inconvenience.
> 
> This has now made it into Linus' tree :-(

The patch is already at davem's tree. It's flying there.

Re: [PATCH net] net/sctp: Always set scope_id in sctp_inet6_skb_msgname

2017-11-16 Thread David Miller

From: ebied...@xmission.com (Eric W. Biederman)
Date: Wed, 15 Nov 2017 22:17:48 -0600

> 
> Alexander Potapenko while testing the kernel with KMSAN and syzkaller
> discovered that in some configurations sctp would leak 4 bytes of
> kernel stack.
> 
> Working with his reproducer I discovered that those 4 bytes that
> are leaked is the scope id of an ipv6 address returned by recvmsg.
> 
> With a little code inspection and a shrewd guess I discovered that
> sctp_inet6_skb_msgname only initializes the scope_id field for link
> local ipv6 addresses to the interface index the link local address
> pertains to instead of initializing the scope_id field for all ipv6
> addresses.
> 
> That is almost reasonable as scope_id's are meaningful only for link
> local addresses.  Set the scope_id in all other cases to 0 which is
> not a valid interface index to make it clear there is nothing useful
> in the scope_id field.
> 
> There should be no danger of breaking userspace as the stack leak
> guaranteed that previously meaningless random data was being returned.
> 
> Fixes: 372f525b495c ("SCTP:  Resync with LKSCTP tree.")
> History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
> Reported-by: Alexander Potapenko 
> Tested-by: Alexander Potapenko 
> Signed-off-by: "Eric W. Biederman" 

Applied and queued up for -stable, thanks Eric.

Re: [PATCH] fealnx: Fix building error on MIPS

2017-11-16 Thread David Miller

From: Huacai Chen 
Date: Thu, 16 Nov 2017 11:07:15 +0800

> This patch tries to fix the build error on MIPS. The reason is MIPS
> has already defined the LONG macro, which conflicts with the LONG enum
> in drivers/net/ethernet/fealnx.c.
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Huacai Chen 

Applied and queued up for -stable.

Re: [PATCH net-next] driver: ipvlan: Add new func ipvlan_is_valid_dev instead of duplicated codes

2017-11-16 Thread David Miller

From: gfree.w...@vip.163.com
Date: Thu, 16 Nov 2017 09:11:23 +0800

> From: Gao Feng 
> 
> There are multiple duplicated condition checks in the current codes, so
> I add the new func ipvlan_is_valid_dev instead of the duplicated codes to
> check if the netdev is real ipvlan dev.
> 
> Signed-off-by: Gao Feng 

net-next is currently closed, please resubmit this when the net-next
tree opens back up.

Thank you.

Re: pull request (net): ipsec 2017-11-16

2017-11-16 Thread David Miller

From: Steffen Klassert 
Date: Thu, 16 Nov 2017 11:00:38 +0100

> 1) Copy policy family in clone_policy, otherwise this can
>trigger a BUG_ON in af_key. From Herbert Xu.
> 
> 2) Revert "xfrm: Fix stack-out-of-bounds read in xfrm_state_find."
>This added a regression with transport mode when no addresses
>are configured on the policy template.
> 
> Both patches are stable candidates.
> 
> Please pull or let me know if there are problems.

Pulled, thanks Steffen.

Re: [PATCH 00/12] isdn: hisax: Fix pnp_irq's error checking

2017-11-16 Thread David Miller

From: Arvind Yadav 
Date: Thu, 16 Nov 2017 09:57:17 +0530

> The pnp_irq() function returns -1 if an error occurs.
> pnp_irq() error checking for zero is not correct.

I can't say I like all of the mixing of "-1", and unsigned
integer values (pnp_irq() returns resource_size_t, the
IsdnCard->para[] array entries are "unsigned long") but
what can I do.

Series applied, thanks.

Re: [RFC PATCH 06/17] net: sched: explicit locking in gso_cpu fallback

2017-11-16 Thread John Fastabend

On 11/15/2017 09:51 AM, Willem de Bruijn wrote:
> On Wed, Nov 15, 2017 at 10:11 AM, John Fastabend
>  wrote:
>> On 11/14/2017 04:41 PM, Willem de Bruijn wrote:
  /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
  static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
  {
 -   struct sk_buff *skb = sch->gso_skb;
 +   struct sk_buff *skb = skb_peek(&sch->gso_skb);

 if (skb) {
 -   sch->gso_skb = NULL;
 +   skb = __skb_dequeue(&sch->gso_skb);
 qdisc_qstats_backlog_dec(sch, skb);
 sch->q.qlen--;
>>>
>>> In lockless qdiscs, can this race, so that __skb_dequeue returns NULL?
>>> Same for its use in qdisc_peek_dequeued.
>>>
>>
>> Yes, agree if this was used in lockless qdisc it could race. However,
>> I don't think it is actually used in the lockless cases yet. For pfifo
>> fast __skb_array_peek is used.
> 
> Oh right. That will be easy to miss when other qdiscs are converted
> to lockless. Perhaps another location to add lockdep annotations.
> 

Yep. Will add lockdep here.

> Related: what happens when pfifo_fast is used as a leaf in a non-lockless
> qdisc hierarchy, say htb? The individual leaves will still have
> TCQ_F_NOLOCK, so will try to take the qdisc lock in dequeue_skb
> and other locations, but that is already held?
> 

Right. So I guess TCQ_F_NOLOCK needs to be propagated through the chain
of qdiscs and at attach we can only set TCQ_F_NOLOCK if the entire chain
of qdisc's are NOLOCK. Will spin an update with this as well.

> Thanks for revising and resubmitting the patchset, btw!
> 

no problem will be good to finally get this out of my queue.

Re: [RFC PATCH net-next 0/2] Configuring PFC stall prevention via ethtool

2017-11-16 Thread Michal Kubecek

On Thu, Nov 16, 2017 at 02:03:21PM +0200, Eran Ben Elisha wrote:
> On Thu, Nov 16, 2017 at 10:44 AM, Michal Kubecek  wrote:
> >
> > I don't like adding another ethtool_ops callback tightly tied to the
> > structures passed via ioctl() but when I started to think what to
> > suggest as an alternative, I started to wonder if it is really necessary
> > to add a new ethtool command at all. Couldn't this be handled as
> > a tunable?
> 
> tunable seems as a good infrastructure to PHY tuning, however this
> feature is not a PHY feature.

There are two kinds: tunables (ETHTOOL_{G,S}TUNABLE) and phy tunables
(ETHTOOL_PHY_{G,S}TUNABLE). My understanding is that former is meant as
a generic interface for parameters related to net_device, latter for
parameters related to phydev.

It's only my guess but IMHO the idea behind (both kinds of) tunables was
to add at least some extensibility to the ioctl() interface so that we
don't have to add more and more new one-purpose commands (until we can
switch to netlink).

> To me, it's looks fit to ethtool -a where pause operations are being
> controlled.  Unfortunately set/get_pauseparam is not extensible and
> need a new operation.

Yes, logically it belongs there, no question about that. But the
relation between ethtool subcommands (on command line) and ioctl
commands (cmd member of the structures) is not 1:1 in general (this is
one of the reasons). After all, your patchset uses another command
anyway; my suggestion is to use an existing one (ETHTOOL_{G,S}TUNABLE)
rather than adding a new one (and new callback to ethtool_ops).

Michal Kubecek

Re: [PATCH 29/31] MAINTAINERS: Add nds32

2017-11-16 Thread Greentime Hu

2017-11-14 23:39 GMT+08:00 Joe Perches :
> On Thu, 2017-11-09 at 11:36 +0100, Arnd Bergmann wrote:
>> On Thu, Nov 9, 2017 at 10:46 AM, Greentime Hu  wrote:
>> > 2017-11-08 21:31 GMT+08:00 Rob Herring :
>> > > On Tue, Nov 7, 2017 at 11:55 PM, Greentime Hu  wrote:
>> > > > From: Greentime Hu 
>> > > >
>> > > > Signed-off-by: Greentime Hu 
>> > > > ---
>> > > >  MAINTAINERS |9 +
>> > > >  1 file changed, 9 insertions(+)
>> > > >
>> > > > diff --git a/MAINTAINERS b/MAINTAINERS
>> > > > index 2f4e462..bce1181 100644
>> > > > --- a/MAINTAINERS
>> > > > +++ b/MAINTAINERS
>> > > > @@ -857,6 +857,15 @@ X: drivers/iio/*/adjd*
>> > > >  F: drivers/staging/iio/*/ad*
>> > > >  F: drivers/staging/iio/trigger/iio-trig-bfin-timer.c
>> > > >
>> > > > +ANDES ARCHITECTURE
>> > > > +M: Greentime Hu 
>> > > > +M: Vincent Chen 
>> > > > +T: git https://github.com/andestech/linux.git
>> > > > +S: Supported
>> > > > +F: arch/nds32
>> > >
>> > > DT binding files?
>> >
>> > Thanks.
>> > I should add
>> > F:
>> > Documentation/devicetree/bindings/interrupt-controller/andestech,ativic32.txt
>> > F:Documentation/devicetree/bindings/nds32/cpus.txt
>> >
>> > I will fix it in the next version patch.
>>
>> Better make the second one the directory, in case you add more files
>> there later.
>>
Thanks. I will make the second one a directory.

>> I would also add a "K: nds32" line to catch all files that have this
>> in their names.
>
> That's probably not what would be desired
>
> K: looks for keywords inside all file contents
> N: looks for filenames matching a regex pattern
>
> See the header in MAINTAINERS for more details
>

Thanks.
I will add K and N both.

Re: [PATCH v3 3/4] net: nb8800: Move HW init to ndo_open()

2017-11-16 Thread Marc Gonzalez

On 15/11/2017 17:15, Marc Gonzalez wrote:

> Given the out-of-order datagrams, I'm wondering if it's possible
> for the DMA engine to overwrite a not-yet-read descriptor?
> 
> The EOC flag should stop the DMA engine though...
> 
> Maybe some kind of race...
> 
> I don't think I've been able to trigger the wedge when 256 descriptors
> are used.

I'm still taking stabs in the dark.

Adding a 10 ms delay for every 1024 packets, and using 256 descriptors
doesn't cause any hang, even after an hour.

At 85 packets per ms, 10 ms is large enough to consume all available
descriptors... Yet no issue.

Only when I lower the number of available RX descriptors do I see
the issue. And I don't need any delay...

I'm starting to think there is some kind of race condition between
SW and HW handling of descriptors. This might also explain the
out-of-order warnings.

Lowering the number of RX descriptors to 64, and adding no delay
causes many OUT OF ORDER warnings, and the network locks up in
under a minute:


iperf3: OUT OF ORDER - incoming packet = 13031 and received packet = 0 AND SP = 
13094
iperf3: OUT OF ORDER - incoming packet = 21441 and received packet = 0 AND SP = 
21504
iperf3: OUT OF ORDER - incoming packet = 21442 and received packet = 0 AND SP = 
21504
[ ID] Interval   Transfer Bandwidth   JitterLost/Total 
Datagrams
[  5]   0.00-1.00   sec  97.4 MBytes   817 Mbits/sec  0.016 ms  8793/81753 
(11%)  
iperf3: OUT OF ORDER - incoming packet = 92503 and received packet = 0 AND SP = 
92566
iperf3: OUT OF ORDER - incoming packet = 92504 and received packet = 0 AND SP = 
92566
iperf3: OUT OF ORDER - incoming packet = 125503 and received packet = 0 AND SP 
= 125566
[  5]   1.00-2.00   sec   101 MBytes   851 Mbits/sec  0.011 ms  8835/84797 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 198710 and received packet = 0 AND SP 
= 198773
iperf3: OUT OF ORDER - incoming packet = 243272 and received packet = 0 AND SP 
= 243335
iperf3: OUT OF ORDER - incoming packet = 243791 and received packet = 0 AND SP 
= 243854
[  5]   2.00-3.00   sec   101 MBytes   851 Mbits/sec  0.016 ms  8771/84782 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 294545 and received packet = 0 AND SP 
= 294608
iperf3: OUT OF ORDER - incoming packet = 301719 and received packet = 0 AND SP 
= 301782
iperf3: OUT OF ORDER - incoming packet = 301720 and received packet = 0 AND SP 
= 301782
iperf3: OUT OF ORDER - incoming packet = 305218 and received packet = 0 AND SP 
= 305281
iperf3: OUT OF ORDER - incoming packet = 314655 and received packet = 0 AND SP 
= 314718
iperf3: OUT OF ORDER - incoming packet = 325473 and received packet = 0 AND SP 
= 325536
iperf3: OUT OF ORDER - incoming packet = 325474 and received packet = 0 AND SP 
= 325536
[  5]   3.00-4.00   sec   101 MBytes   850 Mbits/sec  0.013 ms  8903/84798 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 340919 and received packet = 0 AND SP 
= 340982
iperf3: OUT OF ORDER - incoming packet = 340920 and received packet = 0 AND SP 
= 340982
iperf3: OUT OF ORDER - incoming packet = 346325 and received packet = 0 AND SP 
= 346388
iperf3: OUT OF ORDER - incoming packet = 346326 and received packet = 0 AND SP 
= 346388
[  5]   4.00-5.00   sec   101 MBytes   851 Mbits/sec  0.012 ms  8868/84829 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 441873 and received packet = 0 AND SP 
= 441936
iperf3: OUT OF ORDER - incoming packet = 459258 and received packet = 0 AND SP 
= 459321
iperf3: OUT OF ORDER - incoming packet = 495604 and received packet = 0 AND SP 
= 495667
[  5]   5.00-6.00   sec   101 MBytes   850 Mbits/sec  0.013 ms  8835/84760 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 558280 and received packet = 0 AND SP 
= 558343
iperf3: OUT OF ORDER - incoming packet = 558281 and received packet = 0 AND SP 
= 558343
iperf3: OUT OF ORDER - incoming packet = 587445 and received packet = 0 AND SP 
= 587508
iperf3: OUT OF ORDER - incoming packet = 587446 and received packet = 0 AND SP 
= 587508
[  5]   6.00-7.00   sec   102 MBytes   854 Mbits/sec  0.016 ms  8580/84843 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 617324 and received packet = 0 AND SP 
= 617387
iperf3: OUT OF ORDER - incoming packet = 620412 and received packet = 0 AND SP 
= 620475
iperf3: OUT OF ORDER - incoming packet = 635845 and received packet = 0 AND SP 
= 635908
iperf3: OUT OF ORDER - incoming packet = 657102 and received packet = 0 AND SP 
= 657165
iperf3: OUT OF ORDER - incoming packet = 670672 and received packet = 0 AND SP 
= 670735
iperf3: OUT OF ORDER - incoming packet = 670673 and received packet = 0 AND SP 
= 670735
[  5]   7.00-8.00   sec   102 MBytes   853 Mbits/sec  0.012 ms  8582/84757 
(10%)  
iperf3: OUT OF ORDER - incoming packet = 676529 and received packet = 0 AND SP 
= 676592
iperf3: OUT OF ORDER - incoming packet = 682693 and received packet = 0 AND SP 
= 682756
iperf3: OUT OF ORDER - incoming packet = 682694 and received packet = 0 AND SP 
= 682756
iperf3: OUT OF ORDER - incoming packet

Re: [RFC PATCH net-next 0/2] Configuring PFC stall prevention via ethtool

2017-11-16 Thread Eran Ben Elisha

On Thu, Nov 16, 2017 at 10:44 AM, Michal Kubecek  wrote:
> On Wed, Nov 15, 2017 at 09:00:09PM +0200, Eran Ben Elisha wrote:
>> From: Inbar Karmy 
>>
>> This RFC adds support for configuring PFC stall prevention through ethtool.
>>
>> In the event where the device unexpectedly becomes unresponsive for a long
>> period of time, flow control mechanism may propagate pause frames which will
>> cause congestion spreading to the entire network.
>>
>> To prevent this scenario, the device may implement a protection mechanism for
>> monitoring and resolving such state.  The following patches allow the user to
>> control the stall prevention functionality.
>>
>> PFC stall prevention configuration is done via ethtool -a (pause).
>> Two modes are introduced:
>> Default - current behavior per driver.
>> Auto - protection mechanism controlled automatically by the driver.
>> Due to lack of extension ability of ethtool_ops set_pauseparam, a new
>> ethtool_ops get_pfc_prevention_mode is introduced.
>
> I don't like adding another ethtool_ops callback tightly tied to the
> structures passed via ioctl() but when I started to think what to
> suggest as an alternative, I started to wonder if it is really necessary
> to add a new ethtool command at all. Couldn't this be handled as
> a tunable?
>
> Michal Kubecek

Tunables seem like a good infrastructure for PHY tuning; however, this
feature is not a PHY feature.
To me, it looks like a fit for ethtool -a, where pause operations are controlled.
Unfortunately set/get_pauseparam is not extensible and needs a new operation.

Eran.

[PATCH net-next v3] net: assign err to 0 at begin in do_setlink() function

2017-11-16 Thread yuan linyu

From: yuan linyu 

Each netlink attribute has its own error handling, so when the
processing of one attribute completes, it implies that no error occurred;
the intermediate err = 0; assignments are therefore useless.

Assign err = 0; at the beginning to cover the case where no attributes are set.

v1 -> v2:
fix review comment from David, clear err before
nla_for_each_nested()

v2 -> v3:
maybe wrong understanding of David comment,
provide a new version

Signed-off-by: yuan linyu 
---
 net/core/rtnetlink.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a..54f792b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2075,7 +2075,7 @@ static int do_setlink(const struct sk_buff *skb,
  struct nlattr **tb, char *ifname, int status)
 {
const struct net_device_ops *ops = dev->netdev_ops;
-   int err;
+   int err = 0;
 
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
struct net *net = rtnl_link_get_net(dev_net(dev), tb);
@@ -2253,7 +2253,6 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
}
-   err = 0;
 
if (tb[IFLA_VF_PORTS]) {
struct nlattr *port[IFLA_PORT_MAX+1];
@@ -2261,9 +2260,10 @@ static int do_setlink(const struct sk_buff *skb,
int vf;
int rem;
 
-   err = -EOPNOTSUPP;
-   if (!ops->ndo_set_vf_port)
+   if (!ops->ndo_set_vf_port) {
+   err = -EOPNOTSUPP;
goto errout;
+   }
 
nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
if (nla_type(attr) != IFLA_VF_PORT ||
@@ -2286,7 +2286,6 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
}
-   err = 0;
 
if (tb[IFLA_PORT_SELF]) {
struct nlattr *port[IFLA_PORT_MAX+1];
@@ -2326,7 +2325,6 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
}
-   err = 0;
 
if (tb[IFLA_PROTO_DOWN]) {
err = dev_change_proto_down(dev,
-- 
2.7.4

Re: [PATCH] net/smc: Fix preinitialization of buf_desc in __smc_buf_create()

2017-11-16 Thread Arnd Bergmann

On Thu, Nov 16, 2017 at 12:22 PM, Geert Uytterhoeven
 wrote:
> With gcc-4.1.2:
>
> net/smc/smc_core.c: In function ‘__smc_buf_create’:
> net/smc/smc_core.c:567: warning: ‘bufsize’ may be used uninitialized in 
> this function
>
> Indeed, if the for-loop is never executed, bufsize is used
> uninitialized.  In addition, buf_desc is stored for later use, while it
> is still a NULL pointer.
>
> Before, error handling was done by checking if buf_desc is non-NULL.
> The cleanup changed this to an error check, but forgot to update the
> preinitialization of buf_desc to an error pointer.
>
> Update the preinitialization of buf_desc to fix this.
>
> Fixes: b33982c3a6838d13 ("net/smc: cleanup function __smc_buf_create()")
> Signed-off-by: Geert Uytterhoeven 
> ---
> I don't know if this can ever happen, but the old code handled it.

Acked-by: Arnd Bergmann 

This one I could reproduce with gcc-4.1 on x86, but not gcc-4.2 or higher.

1 2 >

1 - 100 of 122 matches

Mail list logo