[PATCH net v2] net: sched: do not requeue a NULL skb

2016-04-10 Thread Lars Persson
A failure in validate_xmit_skb_list() triggered an unconditional call
to dev_requeue_skb with skb=NULL. This slowly grows the queue
discipline's qlen count until all traffic through the queue stops.

By introducing a NULL check in dev_requeue_skb it was also necessary
to make the __netif_schedule call conditional to avoid scheduling an
empty queue.

Fixes: 55a93b3ea780 ("qdisc: validate skb without holding lock")
Signed-off-by: Lars Persson 
---
 net/sched/sch_generic.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f18c350..4e6a79c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -47,10 +47,13 @@ EXPORT_SYMBOL(default_qdisc_ops);
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-   q->gso_skb = skb;
-   q->qstats.requeues++;
-   q->q.qlen++;/* it's still part of the queue */
-   __netif_schedule(q);
+   if (skb) {
+   q->gso_skb = skb;
+   q->qstats.requeues++;
+   q->q.qlen++;/* it's still part of the queue */
+   }
+   if (qdisc_qlen(q))
+   __netif_schedule(q);
 
return 0;
 }
-- 
2.1.4



[PATCH net] cxgb4: Stop Rx Queues before freeing it up

2016-04-10 Thread Hariprasad Shenai
Stop all Ethernet RX Queues before freeing up various Ingress/Egress
Queues, etc. We were seeing cases of Ingress Queues not getting serviced
during the shutdown process leading to Ingress Paths jamming up through
the chip and blocking the shutdown effort itself.

One such case involved the Firmware sending a "Flush Token" through the
ULP-TX -> ULP-RX path for an Ethernet TX Queue being freed in order to
make sure there weren't any remaining TX Work Requests in the pipeline.
But the return path was stalled by Ingress Data unable to be delivered to
the Host because those Ingress Queues were no longer being serviced.

Based on original work by Casey Leedom 

Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  3 +++
 drivers/net/ethernet/chelsio/cxgb4/sge.c   | 20 +++---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 33 ++
 3 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 984a3cc26f86..326d4009525e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1451,6 +1451,9 @@ int t4_mdio_rd(struct adapter *adap, unsigned int mbox, 
unsigned int phy_addr,
   unsigned int mmd, unsigned int reg, u16 *valp);
 int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr,
   unsigned int mmd, unsigned int reg, u16 val);
+int t4_iq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf,
+  unsigned int vf, unsigned int iqtype, unsigned int iqid,
+  unsigned int fl0id, unsigned int fl1id);
 int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
   unsigned int vf, unsigned int iqtype, unsigned int iqid,
   unsigned int fl0id, unsigned int fl1id);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c 
b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 13b144bcf725..6278e5a74b74 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -2981,14 +2981,28 @@ void t4_free_ofld_rxqs(struct adapter *adap, int n, 
struct sge_ofld_rxq *q)
 void t4_free_sge_resources(struct adapter *adap)
 {
int i;
-   struct sge_eth_rxq *eq = adap->sge.ethrxq;
-   struct sge_eth_txq *etq = adap->sge.ethtxq;
+   struct sge_eth_rxq *eq;
+   struct sge_eth_txq *etq;
+
+   /* stop all Rx queues in order to start them draining */
+   for (i = 0; i < adap->sge.ethqsets; i++) {
+   eq = &adap->sge.ethrxq[i];
+   if (eq->rspq.desc)
+   t4_iq_stop(adap, adap->mbox, adap->pf, 0,
+  FW_IQ_TYPE_FL_INT_CAP,
+  eq->rspq.cntxt_id,
+  eq->fl.size ? eq->fl.cntxt_id : 0x,
+  0x);
+   }
 
/* clean up Ethernet Tx/Rx queues */
-   for (i = 0; i < adap->sge.ethqsets; i++, eq++, etq++) {
+   for (i = 0; i < adap->sge.ethqsets; i++) {
+   eq = &adap->sge.ethrxq[i];
if (eq->rspq.desc)
free_rspq_fl(adap, &eq->rspq,
 eq->fl.size ? &eq->fl : NULL);
+
+   etq = &adap->sge.ethtxq[i];
if (etq->q.desc) {
t4_eth_eq_free(adap, adap->mbox, adap->pf, 0,
   etq->q.cntxt_id);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index cc1736bece0f..520ffcaef6d8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -6940,6 +6940,39 @@ int t4_identify_port(struct adapter *adap, unsigned int 
mbox, unsigned int viid,
 }
 
 /**
+ * t4_iq_stop - stop an ingress queue and its FLs
+ * @adap: the adapter
+ * @mbox: mailbox to use for the FW command
+ * @pf: the PF owning the queues
+ * @vf: the VF owning the queues
+ * @iqtype: the ingress queue type (FW_IQ_TYPE_FL_INT_CAP, etc.)
+ * @iqid: ingress queue id
+ * @fl0id: FL0 queue id or 0xffff if no attached FL0
+ * @fl1id: FL1 queue id or 0xffff if no attached FL1
+ *
+ * Stops an ingress queue and its associated FLs, if any.  This causes
+ * any current or future data/messages destined for these queues to be
+ * tossed.
+ */
+int t4_iq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf,
+  unsigned int vf, unsigned int iqtype, unsigned int iqid,
+  unsigned int fl0id, unsigned int fl1id)
+{
+   struct fw_iq_cmd c;
+
+   memset(&c, 0, sizeof(c));
+   c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F |
+ FW_CMD_EXEC_F | FW_IQ_CMD_PFN_V(pf) |
+ FW_IQ_CMD_VFN_V(vf));
+   c.alloc_to_l

Re: Backport Security Fix for CVE-2015-8787 to v4.1

2016-04-10 Thread Yuki Machida

Hi Pablo,

On 2016年04月11日 13:42, Yuki Machida wrote:

Hi Pablo,

On 2016年04月07日 23:46, Pablo Neira Ayuso wrote:

On Thu, Apr 07, 2016 at 03:40:30PM +0900, Yuki Machida wrote:

Hi David,

I confirmed that a patch for CVE-2015-8787 is not applied in v4.1.21.
Could you please apply a patch for 4.1-stable ?

CVE-2015-8787
Upstream commit 94f9cd81436c85d8c3a318ba92e236ede73752fc


I'll request again, this time to Sasha Levin to include this in
-stable 4.1.

Thank you for your help.

David said "Please send to the netfilter team".
Therefore, I will send above patch to netfilter team.

Thank you.

Thanks.





Re: Backport Security Fix for CVE-2015-8787 to v4.1

2016-04-10 Thread Yuki Machida

Hi Pablo,

On 2016年04月07日 23:46, Pablo Neira Ayuso wrote:

On Thu, Apr 07, 2016 at 03:40:30PM +0900, Yuki Machida wrote:

Hi David,

I confirmed that a patch for CVE-2015-8787 is not applied in v4.1.21.
Could you please apply a patch for 4.1-stable ?

CVE-2015-8787
Upstream commit 94f9cd81436c85d8c3a318ba92e236ede73752fc


I'll request again, this time to Sasha Levin to include this in
-stable 4.1.

Thank you for your help.


Thanks.



Re: [PATCH 1/1] net: stmmac: socfgpa: Ensure emac bit set in System Manger for PTP

2016-04-10 Thread David Miller
From: Phil Reid 
Date: Thu,  7 Apr 2016 15:55:35 +0800

> When using the PTP fpga to hps clock source for the stmmac module
> the appropriate bit in the System Manager FPGA Interface Group register
> needs to be set. This is not set by the bootloader setup  when the
> HPS emac pins are being used for this emac module.
> 
> This allows the PTP clock to be sourced from the FPGA and also connects
> the PTP pps and ext trig signals to the stmmac PTP hardware.
> 
> Patch proposed by Phil Collins.
> 
> Signed-off-by: Phil Reid 

Applied, thanks.


Re: [PATCH] netlink: don't send NETLINK_URELEASE for unbound sockets

2016-04-10 Thread David Miller
From: Johannes Berg 
Date: Thu,  7 Apr 2016 09:31:38 +0200

> From: Dmitry Ivanov 
> 
> All existing users of NETLINK_URELEASE use it to clean up resources that
> were previously allocated to a socket via some command. As a result, no
> users require getting this notification for unbound sockets.
> 
> Sending it for unbound sockets, however, is a problem because any user
> (including unprivileged users) can create a socket that uses the same ID
> as an existing socket. Binding this new socket will fail, but if the
> NETLINK_URELEASE notification is generated for such sockets, the users
> thereof will be tricked into thinking the socket that they allocated the
> resources for is closed.
> 
> In the nl80211 case, this will cause destruction of virtual interfaces
> that still belong to an existing hostapd process; this is the case that
> Dmitry noticed. In the NFC case, it will cause a poll abort. In the case
> of netlink log/queue it will cause them to stop reporting events, as if
> NFULNL_CFG_CMD_UNBIND/NFQNL_CFG_CMD_UNBIND had been called.
> 
> Fix this problem by checking that the socket is bound before generating
> the NETLINK_URELEASE notification.
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Dmitry Ivanov 
> Signed-off-by: Johannes Berg 

Applied and queued up for -stable, thanks everyone.


[PATCH net-next WIP] ethtool: generic netlink policy

2016-04-10 Thread Roopa Prabhu
From: Roopa Prabhu 

netlink for ethtool came up at netconf/netdev and we had promised to
send some of the ethtool netlink code we have.
We use a generic netlink channel for ethtool between our kernel and
user space driver. This ethtool channel nicely wraps most ethtool
commands into genl messages. And is capable of handling delayed
remote ops to userspace in some cases (dropping rtnl etc). We use
this channel to also cache some of this ethtool data in the kernel.
In this patch I have included just the genl policy for ethtool which
will apply to the generic usecase. We can certainly share the rest of
it if we see a usecase. Especially the remote handling of ethtool ops
for delayed hw operations maybe useful in other cases (today they are
tied to our remote driver in userspace). The ethtool handlers for
genl use the existing ethtool structs and call into the
respective driver handlers.

This came up again at the switchdev discussion recently and I had
promised to get this out this weekend :). This patch does not include
changes to compile the code.

We should move ethtool to netlink at some point: And I think we
should also explore the possibility of including it into the existing
new devlink generic netlink infrastructure. And ethtool stats should
move to the new stats infrastructure.

Signed-off-by: Roopa Prabhu 
Signed-off-by: Shrijeet Mukherjee 
---
 net/core/ethtool_netlink.c | 200 +
 1 file changed, 200 insertions(+)
 create mode 100644 net/core/ethtool_netlink.c

diff --git a/net/core/ethtool_netlink.c b/net/core/ethtool_netlink.c
new file mode 100644
index 000..f5445f3
--- /dev/null
+++ b/net/core/ethtool_netlink.c
@@ -0,0 +1,200 @@
+/*
+ *  net/core/ethtool_netlink.c - generic ethtool netlink handler
+ *  Copyright (C) 2015 Cumulus Networks
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static const struct nla_policy ethtool_policy[ETHTOOL_ATTR_MAX + 1] = {
+   [ETHTOOL_ATTR_IFINDEX]  = { .type = NLA_U32 },
+   [ETHTOOL_ATTR_FLAGS]= { .type = NLA_U32 },
+   [ETHTOOL_ATTR_PHYS_ID_STATE]= { .type = NLA_U8 },
+   [ETHTOOL_ATTR_SETTINGS] = { .type = NLA_BINARY,
+   .len = sizeof(struct ethtool_cmd) },
+   [ETHTOOL_ATTR_PAUSE]= { .type = NLA_BINARY,
+   .len = sizeof(struct ethtool_pauseparam) },
+   [ETHTOOL_ATTR_MODINFO]  = { .type = NLA_BINARY,
+   .len = sizeof(struct ethtool_modinfo) },
+   [ETHTOOL_ATTR_EEPROM]   = { .type = NLA_BINARY,
+   .len = sizeof(struct ethtool_eeprom) },
+   [ETHTOOL_ATTR_EEPROM_DATA]  = { .type = NLA_BINARY },
+   [ETHTOOL_ATTR_STATS]= { .type = NLA_NESTED },
+   [ETHTOOL_ATTR_STAT] = { .type = NLA_U32 },
+   [ETHTOOL_ATTR_STRINGS]  = { .type = NLA_NESTED },
+   [ETHTOOL_ATTR_STRING]   = { .type = NLA_STRING,
+   .len = ETH_GSTRING_LEN },
+   [ETHTOOL_ATTR_SSET] = { .type = NLA_U32 },
+   [ETHTOOL_ATTR_SSET_COUNT]   = { .type = NLA_U32 },
+};
+
+static struct genl_family ethtool_family = {
+   .id = GENL_ID_GENERATE,
+   .name = "ethtool_family",
+   .version = 1,
+   .maxattr = ETHTOOL_ATTR_MAX,
+};
+
+static struct genl_multicast_group ethtool_mcgrp[] = {
+   { .name = "port_mc", },
+};
+
+static LIST_HEAD(wq_list);
+
+static struct genl_ops ethtool_ops[] = {
+   {
+   .cmd = ETHTOOL_CMD_GET_SETTINGS,
+   .policy = ethtool_policy,
+   .doit = ethtool_get_settings,
+   },
+   {
+   .cmd = ETHTOOL_CMD_SET_SETTINGS,
+   .policy = ethtool_policy,
+   .doit = ethtool_set_settings,
+   },
+   {
+   .cmd = ETHTOOL_CMD_GET_PAUSE,
+   .policy = ethtool_policy,
+   .doit = ethtool_get_pause,
+   },
+   {
+   .cmd = ETHTOOL_CMD_SET_PAUSE,
+   .policy = ethtool_policy,
+   .doit = ethtool_set_pause,
+   },
+   {
+   .cmd = ETHTOOL_CMD_GET_MODULE_INFO,
+   .policy = ethtool_policy,
+   .doit = ethtool_get_module_info,
+   },
+   {
+   .cmd = ETHTOOL_CMD_SET_MODULE_INFO,
+   .policy = etht

RE: [v7, 0/5] Fix eSDHC host version register bug

2016-04-10 Thread Yangbo Lu
Hi Leo and Scott,


> -Original Message-
> From: Ulf Hansson [mailto:ulf.hans...@linaro.org]
> Sent: Wednesday, April 06, 2016 4:15 PM
> To: Yangbo Lu; Scott Wood
> Cc: devicet...@vger.kernel.org; linux-arm-ker...@lists.infradead.org;
> linux-ker...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; linux-
> c...@vger.kernel.org; linux-...@vger.kernel.org; iommu@lists.linux-
> foundation.org; netdev@vger.kernel.org; linux-mmc; Rob Herring; Russell
> King; Jochen Friedrich; Joerg Roedel; Claudiu Manoil; Bhupesh Sharma;
> Zhao Qiang; Kumar Gala; Santosh Shilimkar; Yang-Leo Li; Xiaobo Xie
> Subject: Re: [v7, 0/5] Fix eSDHC host version register bug
> 
> >>
> >> I was about to queue this for next, when I noticed that checkpatch is
> >> complaining/warning lots about these patches. Can you please run a round
> >> of checkpatch and fix what makes sense.
> >>
> >> Kind regards
> >> Uffe
> >
> > [Lu Yangbo-B47093] Sorry about this, Uffe...
> 
> No worries!
> 
> > Should I ignore the warnings that update MAINTAINERS?
> 
> drivers/soc/fsl/guts.c isn't part of the MAINTAINERS file, it should be.
> 
> I also realize that the FREESCALE QUICC ENGINE LIBRARY section
> drivers/soc/fsl/qe/* also need an active maintainer, as it's currently
> orphan.
> 
> Perhaps we should have create a new section for drivers/soc/fsl/* instead
> that covers all of the above? Maybe you or Scott are interested to pick
> it up?
> 
> I also noted that, "include/linux/fsl/" isn't present in MAINTAINERS,
> please add that as well.

[Lu Yangbo-B47093] Could give some advice on the MAINTAINERS for these 'fsl' 
files
since I really don’t know who should be the right person?
I will appreciate that!

Thanks a lot.

> 
> > Regarding the 'undocumented' warning, I will added a patch updates doc
> before all the patches, Ok?
> 
> Yes, good!
> 
> >
> > Thanks a lot :)
> >
> 
> Kind regards
> Uffe


Re: [PATCH] net: mark DECnet as broken

2016-04-10 Thread David Miller
From: Vegard Nossum 
Date: Thu,  7 Apr 2016 09:22:43 +0200

> There are NULL pointer dereference bugs in DECnet which can be triggered
> by unprivileged users and have been reported multiple times to LKML,
> however nobody seems confident enough in the proposed fixes to merge them
> and the consensus seems to be that nobody cares enough about DECnet to
> see it fixed anyway.
> 
> To shield unsuspecting users from the possible DOS, we should mark this
> BROKEN until somebody who actually uses this code can fix it.
> 
> Signed-off-by: Vegard Nossum 
> Link: https://lkml.org/lkml/2015/12/17/666

As stated, I'm not applying this, and rather I am fixing this as
below:


[PATCH] decnet: Do not build routes to devices without decnet private data.

In particular, make sure we check for decnet private presence
for loopback devices.

Signed-off-by: David S. Miller 
---
 net/decnet/dn_route.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 607a14f..b1dc096 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1034,10 +1034,13 @@ source_ok:
if (!fld.daddr) {
fld.daddr = fld.saddr;
 
-   err = -EADDRNOTAVAIL;
if (dev_out)
dev_put(dev_out);
+   err = -EINVAL;
dev_out = init_net.loopback_dev;
+   if (!dev_out->dn_ptr)
+   goto out;
+   err = -EADDRNOTAVAIL;
dev_hold(dev_out);
if (!fld.daddr) {
fld.daddr =
@@ -1110,6 +1113,8 @@ source_ok:
if (dev_out == NULL)
goto out;
dn_db = rcu_dereference_raw(dev_out->dn_ptr);
+   if (!dn_db)
+   goto e_inval;
/* Possible improvement - check all devices for local addr */
if (dn_dev_islocal(dev_out, fld.daddr)) {
dev_put(dev_out);
@@ -1151,6 +1156,8 @@ select_source:
dev_put(dev_out);
dev_out = init_net.loopback_dev;
dev_hold(dev_out);
+   if (!dev_out->dn_ptr)
+   goto e_inval;
fld.flowidn_oif = dev_out->ifindex;
if (res.fi)
dn_fib_info_put(res.fi);
-- 
2.1.0



Re: [PATCH net-next] bpf: simplify verifier register state assignments

2016-04-10 Thread David Miller
From: Alexei Starovoitov 
Date: Wed, 6 Apr 2016 19:39:21 -0700

> verifier is using the following structure to track the state of registers:
> struct reg_state {
> enum bpf_reg_type type;
> union {
> int imm;
> struct bpf_map *map_ptr;
> };
> };
> and later on in states_equal() does memcmp(&old->regs[i], &cur->regs[i],..)
> to find equivalent states.
> Throughout the code of verifier there are assignments to 'imm' and 'map_ptr'
> fields and it's not obvious that most of the assignments into 'imm' don't
> need to clear extra 4 bytes (like mark_reg_unknown_value() does) to make sure
> that memcmp doesn't go over junk left from 'map_ptr' assignment.
> 
> Simplify the code by converting 'int' into 'long'
> 
> Suggested-by: Daniel Borkmann 
> Signed-off-by: Alexei Starovoitov 

Applied, thanks Daniel.


Re: [PATCH RFT 2/2] macb: kill PHY reset code

2016-04-10 Thread Andrew Lunn
On Sat, Apr 09, 2016 at 01:25:03AM +0300, Sergei Shtylyov wrote:
> With  the 'phylib' now  being aware of  the "reset-gpios" PHY node property,
> there should be no need to frob the PHY reset in this  driver anymore...
> 
> Signed-off-by: Sergei Shtylyov 
> 
> ---
>  drivers/net/ethernet/cadence/macb.c |   17 -
>  drivers/net/ethernet/cadence/macb.h |1 -
>  2 files changed, 18 deletions(-)
> 
> Index: net-next/drivers/net/ethernet/cadence/macb.c
> ===
> --- net-next.orig/drivers/net/ethernet/cadence/macb.c
> +++ net-next/drivers/net/ethernet/cadence/macb.c
> @@ -2884,7 +2884,6 @@ static int macb_probe(struct platform_de
> = macb_clk_init;
>   int (*init)(struct platform_device *) = macb_init;
>   struct device_node *np = pdev->dev.of_node;
> - struct device_node *phy_node;
>   const struct macb_config *macb_config = NULL;
>   struct clk *pclk, *hclk = NULL, *tx_clk = NULL;
>   unsigned int queue_mask, num_queues;
> @@ -2977,18 +2976,6 @@ static int macb_probe(struct platform_de
>   else
>   macb_get_hwaddr(bp);
>  
> - /* Power up the PHY if there is a GPIO reset */
> - phy_node =  of_get_next_available_child(np, NULL);
> - if (phy_node) {
> - int gpio = of_get_named_gpio(phy_node, "reset-gpios", 0);
> -
> - if (gpio_is_valid(gpio)) {
> - bp->reset_gpio = gpio_to_desc(gpio);
> - gpiod_direction_output(bp->reset_gpio, 1);

Hi Sergei

The code you are deleting would have ignored the flags in the gpio
property, i.e. active low. The new code in the previous patch does
however take the flags into account. Did you check if there are any
device trees which have flags, which were never used, but are now
going to be used and thus break...

  Andrew


Re: [PATCH 1/3] bonding: do not allow rlb updates to invalid mac

2016-04-10 Thread David Miller

Please resubmit this patch series with a proper cover letter.

It should have "[PATCH 0/3] ..." as the subject line and explain
at a high level what your patch series is doing, how it is doing
it, and why it is doing it that way.

You must also be explicit about which of my trees your changes
are targeting.

Thanks.


Re: [PATCH v2] sctp: avoid refreshing heartbeat timer too often

2016-04-10 Thread David Miller
From: Marcelo Ricardo Leitner 
Date: Wed,  6 Apr 2016 15:15:19 -0300

> Currently on high rate SCTP streams the heartbeat timer refresh can
> consume quite a lot of resources as timer updates are costly and it
> contains a random factor, which a) is also costly and b) invalidates
> mod_timer() optimization for not editing a timer to the same value.
> It may even cause the timer to be slightly advanced, for no good reason.
> 
> As suggested by David Laight this patch now removes this timer update
> from hot path by leaving the timer on and re-evaluating upon its
> expiration if the heartbeat is still needed or not, similarly to what is
> done for TCP. If it's not needed anymore the timer is re-scheduled to
> the new timeout, considering the time already elapsed.
> 
> For this, we now record the last tx timestamp per transport, updated in
> the same spots as hb timer was restarted on tx. Also split up
> sctp_transport_reset_timers into sctp_transport_reset_t3_rtx and
> sctp_transport_reset_hb_timer, so we can re-arm T3 without re-arming the
> heartbeat one.
> 
> On loopback with MTU of 65535 and data chunks with 1636, so that we
> have a considerable amount of chunks without stressing system calls,
> netperf -t SCTP_STREAM -l 30, perf looked like this before:
 . ..
> And after this patch, now with netperf -l 60:
 ...
> Throughput-wise, from 6800mbps without the patch to 7050mbps with it,
> ~3.7%.
> 
> Signed-off-by: Marcelo Ricardo Leitner 

Applied, thanks Marcelo.


[net-next PATCH v2 3/5] GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID values

2016-04-10 Thread Alexander Duyck
This patch does two things.

First it allows TCP to aggregate TCP frames with a fixed IPv4 ID field.  As
a result we should now be able to aggregate flows that were converted from
IPv6 to IPv4.  In addition this allows us more flexibility for future
implementations of segmentation as we may be able to use a fixed IP ID when
segmenting the flow.

The second thing this does is that it places limitations on the outer IPv4
ID header in the case of tunneled frames.  Specifically it forces the IP ID
to be incrementing by 1 unless the DF bit is set in the outer IPv4 header.
This way we can avoid creating overlapping series of IP IDs that could
possibly be fragmented if the frame goes through GRO and is then
resegmented via GSO.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |5 -
 net/core/dev.c|1 +
 net/ipv4/af_inet.c|   35 ---
 net/ipv4/tcp_offload.c|   16 +++-
 net/ipv6/ip6_offload.c|8 ++--
 5 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eb7f037a4068..6a248a3a44bf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2121,7 +2121,10 @@ struct napi_gro_cb {
/* Used in GRE, set in fou/gue_gro_receive */
u8  is_fou:1;
 
-   /* 6 bit hole */
+   /* Used to determine if flush_id can be ignored */
+   u8  is_atomic:1;
+
+   /* 5 bit hole */
 
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum  csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index e896b1953ab6..b78b586b1856 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4462,6 +4462,7 @@ static enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
+   NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
/* Setup for GRO checksum validation */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5bbea9a0ce96..8564cab96189 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,6 +1328,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
 
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+   u16 flush_id;
 
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1351,16 +1352,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
-   /* Save the IP ID check to be included later when we get to
-* the transport layer so only the inner most IP ID is checked.
-* This is because some GSO/TSO implementations do not
-* correctly increment the IP ID for the outer hdrs.
-*/
-   NAPI_GRO_CB(p)->flush_id =
-   ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ 
id);
NAPI_GRO_CB(p)->flush |= flush;
+
+   /* We need to store of the IP ID check to be included later
+* when we can verify that this packet does in fact belong
+* to a given flow.
+*/
+   flush_id = (u16)(id - ntohs(iph2->id));
+
+   /* This bit of code makes it much easier for us to identify
+* the cases where we are doing atomic vs non-atomic IP ID
+* checks.  Specifically an atomic check can return IP ID
+* values 0 - 0xFFFF, while a non-atomic check can only
+* return 0 or 0xFFFF.
+*/
+   if (!NAPI_GRO_CB(p)->is_atomic ||
+   !(iph->frag_off & htons(IP_DF))) {
+   flush_id ^= NAPI_GRO_CB(p)->count;
+   flush_id = flush_id ? 0x : 0;
+   }
+
+   /* If the previous IP ID value was based on an atomic
+* datagram we can overwrite the value and ignore it.
+*/
+   if (NAPI_GRO_CB(skb)->is_atomic)
+   NAPI_GRO_CB(p)->flush_id = flush_id;
+   else
+   NAPI_GRO_CB(p)->flush_id |= flush_id;
}
 
+   NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 08dd25d835af..d1ffd55289bd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -239,7 +239,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, 
struct sk_buff *skb)
 
 found:
/* In

[net-next PATCH v2 4/5] GSO: Support partial segmentation offload

2016-04-10 Thread Alexander Duyck
This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |5 +
 include/linux/netdevice.h   |2 ++
 include/linux/skbuff.h  |9 +++--
 net/core/dev.c  |   36 +---
 net/core/ethtool.c  |1 +
 net/core/skbuff.c   |   29 -
 net/ipv4/af_inet.c  |   20 
 net/ipv4/gre_offload.c  |   26 +-
 net/ipv4/tcp_offload.c  |   10 --
 net/ipv4/udp_offload.c  |   27 +--
 net/ipv6/ip6_offload.c  |   10 +-
 11 files changed, 151 insertions(+), 24 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7cf272a4b5c8..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
NETIF_F_GSO_SIT_BIT,/* ... SIT tunnel with TSO */
NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */
NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+   NETIF_F_GSO_PARTIAL_BIT,/* ... Only segment inner-most L4
+* in hardware and all other
+* headers in software.
+*/
NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
/**/NETIF_F_GSO_LAST =  /* last bit, see GSO_MASK */
NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID   __NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6a248a3a44bf..e15fbcd79be6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1654,6 +1654,7 @@ struct net_device {
netdev_features_t   vlan_features;
netdev_features_t   hw_enc_features;
netdev_features_t   mpls_features;
+   netdev_features_t   gso_partial_features;
 
int ifindex;
int group;
@@ -4004,6 +4005,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> 
NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> 
NETIF_F_GSO_SHIFT));
 
return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+   SKB_GSO_PARTIAL = 1 << 13,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct 
sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-   int mac_offset;
+   union {
+   int mac_offset;
+   int data_offset;
+   };
int encap_level;
__wsum  csum;
__u16   csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index b78b586b1856..556dd09af3b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struc

[net-next PATCH v2 5/5] Documentation: Add documentation for TSO and GSO features

2016-04-10 Thread Alexander Duyck
This document is a starting point for defining the TSO and GSO features.
The whole thing is starting to get a bit messy so I wanted to make sure we
have notes somewhere to start describing what does and doesn't work.

Signed-off-by: Alexander Duyck 
---
 Documentation/networking/segmentation-offloads.txt |  130 
 1 file changed, 130 insertions(+)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

diff --git a/Documentation/networking/segmentation-offloads.txt 
b/Documentation/networking/segmentation-offloads.txt
new file mode 100644
index ..f200467ade38
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -0,0 +1,130 @@
+Segmentation Offloads in the Linux Networking Stack
+
+Introduction
+
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+
+TCP Segmentation Offload
+
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation is requested the bit for either SKB_GSO_TCP or
+SKB_GSO_TCP6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload.  For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able to determine the offsets of the IP or IPv6 header and the
+TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment.  If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID.  If a device has
+NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
+and we will either increment the IP ID for all frames, or leave it at a
+static value based on driver preference.
+
+UDP Fragmentation Offload
+=========================
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments.  Many of the requirements for UDP
+fragmentation offload are the same as TSO.  However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel.  In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPIP, SKB_GSO_SIT, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers.  For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported.  The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers.  Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel:
+            Outer                   Inner
+MAC         skb_mac_header
+Network     skb_network_header      skb_inner_network_header
+Transport   skb_transport_header
+
+UDP/GRE Tunnel:
+            Outer                   Inner
+MAC         skb_mac_header          skb_inner_mac_header
+Network     skb_network_header      skb_inner_network_header
+Transport   skb_transport_header    skb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel
+header has requested a remote checksum offload.  In this case the inner
+headers will be left with a partial checksum and only the outer header
+checksum will be computed.
+
+Generic Segmentation Offload
+
+
+Generic segmentation offload is a p

[net-next PATCH v2 1/5] ethtool: Add support for toggling any of the GSO offloads

2016-04-10 Thread Alexander Duyck
The strings were missing for several of the GSO offloads that are
available.  This patch provides the missing strings so that we can toggle
or query any of them via the ethtool command.

Signed-off-by: Alexander Duyck 
---
 net/core/ethtool.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f426c5ad6149..6a7f99661c2f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -82,9 +82,11 @@ static const char 
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] =  "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] =  "tx-gre-segmentation",
+   [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] =  "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] =   "tx-udp_tnl-segmentation",
+   [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] ="tx-checksum-sctp",



[net-next PATCH v2 0/5] GRO Fixed IPv4 ID support and GSO partial support

2016-04-10 Thread Alexander Duyck
This patch series sets up a few different things.

First it adds support for GRO of frames with a fixed IP ID value.  This
will allow us to perform GRO for frames that go through things like an IPv6
to IPv4 header translation.

The second item we add is support for segmenting frames that are generated
this way.  Most devices only support an incrementing IP ID value, and in
the case of TCP the IP ID can be ignored in many cases since the DF bit
should be set.  So we can technically segment these frames using existing
TSO if we are willing to allow the IP ID to be mangled.  As such I have
added a matching feature for the new form of GRO/GSO called TCP IPv4 ID
mangling.  With this enabled we can assemble and disassemble a frame with
the sequence number fixed and the only ill effect will be that the IPv4 ID
will be altered which may or may not have any noticeable effect.  As such I
have defaulted the feature to disabled.

The third item this patch series adds is support for partial GSO
segmentation.  Partial GSO segmentation allows us to split a large frame
into two pieces.  The first piece will have an even multiple of MSS worth
of data and the headers before the one pointed to by csum_start will have
been updated so that they are correct for if the data payload had already
been segmented.  By doing this we can do things such as precompute the
outer header checksums for a frame to be segmented allowing us to perform
TSO on devices that don't support tunneling, or tunneling with outer header
checksums.

This patch set is based on the net-next tree, but I included "net: remove
netdevice gso_min_segs" in my tree as I assume it is likely to be applied
before this patch set will and I wanted to avoid a merge conflict.

v2: Fixed items reported by Jesse Gross
fixed missing GSO flag in MPLS check
adding DF check for MANGLEID
Moved extra GSO feature checks into gso_features_check
Rebased batches to account for "net: remove netdevice gso_min_segs"

Driver patches from the first patch set should still be compatible.  However
I do have a few changes in them so I will submit a v2 of those to Jeff
Kirsher once these patches are accepted into net-next.

Example driver patches for i40e, ixgbe, and igb:
https://patchwork.ozlabs.org/patch/608221/
https://patchwork.ozlabs.org/patch/608224/
https://patchwork.ozlabs.org/patch/608225/

---

Alexander Duyck (5):
  ethtool: Add support for toggling any of the GSO offloads
  GSO: Add GSO type for fixed IPv4 ID
  GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID 
values
  GSO: Support partial segmentation offload
  Documentation: Add documentation for TSO and GSO features


 Documentation/networking/segmentation-offloads.txt |  130 
 include/linux/netdev_features.h|8 +
 include/linux/netdevice.h  |8 +
 include/linux/skbuff.h |   27 +++-
 net/core/dev.c |   67 +-
 net/core/ethtool.c |4 +
 net/core/skbuff.c  |   29 
 net/ipv4/af_inet.c |   70 ---
 net/ipv4/gre_offload.c |   27 +++-
 net/ipv4/tcp_offload.c |   30 -
 net/ipv4/udp_offload.c |   27 +++-
 net/ipv6/ip6_offload.c |   21 +++
 net/mpls/mpls_gso.c|1 
 13 files changed, 395 insertions(+), 54 deletions(-)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

--


[net-next PATCH v2 2/5] GSO: Add GSO type for fixed IPv4 ID

2016-04-10 Thread Alexander Duyck
This patch adds support for TSO using IPv4 headers with a fixed IP ID
field.  This is meant to allow us to do a lossless GRO in the case of TCP
flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
headers.

In addition I am adding a feature that for now I am referring to as TSO with
IP ID mangling.  Basically when this flag is enabled the device has the
option to either output the flow with incrementing IP IDs or with a fixed
IP ID regardless of what the original IP ID ordering was.  This is useful
in cases where the DF bit is set and we do not care if the original IP ID
value is maintained.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |3 +++
 include/linux/netdevice.h   |1 +
 include/linux/skbuff.h  |   20 +++-
 net/core/dev.c  |   34 +-
 net/core/ethtool.c  |1 +
 net/ipv4/af_inet.c  |   19 +++
 net/ipv4/gre_offload.c  |1 +
 net/ipv4/tcp_offload.c  |4 +++-
 net/ipv6/ip6_offload.c  |3 ++-
 net/mpls/mpls_gso.c |1 +
 10 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..7cf272a4b5c8 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
NETIF_F_UFO_BIT,/* ... UDPv4 fragmentation */
NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */
NETIF_F_TSO_ECN_BIT,/* ... TCP ECN support */
+   NETIF_F_TSO_MANGLEID_BIT,   /* ... IPV4 ID mangling allowed */
NETIF_F_TSO6_BIT,   /* ... TCPv6 segmentation */
NETIF_F_FSO_BIT,/* ... FCoE segmentation */
NETIF_F_GSO_GRE_BIT,/* ... GRE with TSO */
@@ -120,6 +121,7 @@ enum {
 #define NETIF_F_GSO_SIT__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_MANGLEID   __NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +149,7 @@ enum {
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE   (NETIF_F_TSO | NETIF_F_TSO_ECN | \
+NETIF_F_TSO_MANGLEID | \
 NETIF_F_TSO6 | NETIF_F_UFO)
 
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 347ad5de0d93..eb7f037a4068 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3992,6 +3992,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE!= (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..5fba16658f9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,25 @@ enum {
/* This indicates the tcp segment has CWR set. */
SKB_GSO_TCP_ECN = 1 << 3,
 
-   SKB_GSO_TCPV6 = 1 << 4,
+   SKB_GSO_TCP_FIXEDID = 1 << 4,
 
-   SKB_GSO_FCOE = 1 << 5,
+   SKB_GSO_TCPV6 = 1 << 5,
 
-   SKB_GSO_GRE = 1 << 6,
+   SKB_GSO_FCOE = 1 << 6,
 
-   SKB_GSO_GRE_CSUM = 1 << 7,
+   SKB_GSO_GRE = 1 << 7,
 
-   SKB_GSO_IPIP = 1 << 8,
+   SKB_GSO_GRE_CSUM = 1 << 8,
 
-   SKB_GSO_SIT = 1 << 9,
+   SKB_GSO_IPIP = 1 << 9,
 
-   SKB_GSO_UDP_TUNNEL = 1 << 10,
+   SKB_GSO_SIT = 1 << 10,
 
-   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+   SKB_GSO_UDP_TUNNEL = 1 << 11,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/dev.c b/net/core/dev.c
index 09fb1ace9dc8..e896b1953ab6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2825,14 +2825,36 @@ static netdev_features_t dflt_features_check(const 
struct sk_buff *skb,
return vlan_features_check(skb, features);
 }
 
+static netdev_features_t gso_features_check(const struct sk_buff *skb,
+   struct net_device *dev,
+   

Re: [PATCH] ath9k: remove duplicate assignment of variable ah

2016-04-10 Thread Julian Calaby
Hi All,

On Sun, Apr 10, 2016 at 9:25 PM, Colin King  wrote:
> From: Colin Ian King 
>
> ah is written twice with the same value, remove one of the
> redundant assignments to ah.
>
> Signed-off-by: Colin Ian King 

Looks right to me.

Signed-off-by: Julian Calaby 

Thanks,

> ---
>  drivers/net/wireless/ath/ath9k/init.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/init.c 
> b/drivers/net/wireless/ath/ath9k/init.c
> index 77ace8d..fb702c4 100644
> --- a/drivers/net/wireless/ath/ath9k/init.c
> +++ b/drivers/net/wireless/ath/ath9k/init.c
> @@ -477,7 +477,7 @@ static void ath9k_eeprom_request_cb(const struct firmware 
> *eeprom_blob,
>  static int ath9k_eeprom_request(struct ath_softc *sc, const char *name)
>  {
> struct ath9k_eeprom_ctx ec;
> -   struct ath_hw *ah = ah = sc->sc_ah;
> +   struct ath_hw *ah = sc->sc_ah;
> int err;
>
> /* try to load the EEPROM content asynchronously */
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Julian Calaby

Email: julian.cal...@gmail.com
Profile: http://www.google.com/profiles/julian.calaby/


Re: optimizations to sk_buff handling in rds_tcp_data_ready

2016-04-10 Thread Eric Dumazet
On Sat, 2016-04-09 at 20:18 -0400, Sowmini Varadhan wrote:
> On (04/07/16 07:16), Eric Dumazet wrote:
> > Use skb split like TCP in output path ?
> > Really, pskb_expand_head() is not supposed to copy payload ;)
> 
> Question- how come skb_split doesnt have to deal with frag_list
> and do a skb_walk_frags()? Couldn't the split-line be somewhere
> in the frag_list? Also even for the skb_split_inside_header,
> dont we have to set
>   skb_shinfo(skb1)->frag_list = skb_shinfo(skb)->frag_list;
> and cut loose the skb_shinfo(skb)->frag_list?
> 
> As I try to mimic skb_split in some new set of "skb_carve"
> functions, I'm running into all  the various frag_list cases. I'm 
> afraid I might end up needing most of the stuff under the "Pure 
> masohism" (sic) comment in __pskb_pull_tail(). 
> 

The helper was written for TCP I guess. TCP in output path never uses
skb_shinfo(skb)->frag_list : It is guaranteed to be NULL for all skbs.






[PATCH] ravb: make ravb_ptp_interrupt() *void*

2016-04-10 Thread Sergei Shtylyov
When we have the ISS.CGIS bit set, we already know that gPTP interrupt has
happened, so an extra GIS register check at the end of ravb_ptp_interrupt()
seems superfluous.  We can model the gPTP interrupt  handler like all other
dedicated interrupt handlers in the driver and make it *void*.

Signed-off-by: Sergei Shtylyov 

---
The patch is against the Dave Miller's 'net-next.git' repo.

 drivers/net/ethernet/renesas/ravb.h  |2 +-
 drivers/net/ethernet/renesas/ravb_main.c |8 ++--
 drivers/net/ethernet/renesas/ravb_ptp.c  |9 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

Index: net-next/drivers/net/ethernet/renesas/ravb.h
===
--- net-next.orig/drivers/net/ethernet/renesas/ravb.h
+++ net-next/drivers/net/ethernet/renesas/ravb.h
@@ -1045,7 +1045,7 @@ void ravb_modify(struct net_device *ndev
 u32 set);
 int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value);
 
-irqreturn_t ravb_ptp_interrupt(struct net_device *ndev);
+void ravb_ptp_interrupt(struct net_device *ndev);
 void ravb_ptp_init(struct net_device *ndev, struct platform_device *pdev);
 void ravb_ptp_stop(struct net_device *ndev);
 
Index: net-next/drivers/net/ethernet/renesas/ravb_main.c
===
--- net-next.orig/drivers/net/ethernet/renesas/ravb_main.c
+++ net-next/drivers/net/ethernet/renesas/ravb_main.c
@@ -807,8 +807,10 @@ static irqreturn_t ravb_interrupt(int ir
}
 
/* gPTP interrupt status summary */
-   if ((iss & ISS_CGIS) && ravb_ptp_interrupt(ndev) == IRQ_HANDLED)
+   if (iss & ISS_CGIS) {
+   ravb_ptp_interrupt(ndev);
result = IRQ_HANDLED;
+   }
 
mmiowb();
spin_unlock(&priv->lock);
@@ -838,8 +840,10 @@ static irqreturn_t ravb_multi_interrupt(
}
 
/* gPTP interrupt status summary */
-   if ((iss & ISS_CGIS) && ravb_ptp_interrupt(ndev) == IRQ_HANDLED)
+   if (iss & ISS_CGIS) {
+   ravb_ptp_interrupt(ndev);
result = IRQ_HANDLED;
+   }
 
mmiowb();
spin_unlock(&priv->lock);
Index: net-next/drivers/net/ethernet/renesas/ravb_ptp.c
===
--- net-next.orig/drivers/net/ethernet/renesas/ravb_ptp.c
+++ net-next/drivers/net/ethernet/renesas/ravb_ptp.c
@@ -296,7 +296,7 @@ static const struct ptp_clock_info ravb_
 };
 
 /* Caller must hold the lock */
-irqreturn_t ravb_ptp_interrupt(struct net_device *ndev)
+void ravb_ptp_interrupt(struct net_device *ndev)
 {
struct ravb_private *priv = netdev_priv(ndev);
u32 gis = ravb_read(ndev, GIS);
@@ -319,12 +319,7 @@ irqreturn_t ravb_ptp_interrupt(struct ne
}
}
 
-   if (gis) {
-   ravb_write(ndev, ~gis, GIS);
-   return IRQ_HANDLED;
-   }
-
-   return IRQ_NONE;
+   ravb_write(ndev, ~gis, GIS);
 }
 
 void ravb_ptp_init(struct net_device *ndev, struct platform_device *pdev)



Re: [PATCH net-next v2 1/2] rtnetlink: add new RTM_GETSTATS message to dump link stats

2016-04-10 Thread roopa
On 4/10/16, 6:48 AM, Jamal Hadi Salim wrote:
> On 16-04-09 02:00 PM, roopa wrote:
>
>> This EXTENDED_HW_STATS is for ethtool like extended hw stats. This is 
>> keeping in
>> mind that we want to also move ethtool to netlink in the future and with 
>> switchdev
>> it becomes more necessary that we provide all stats closer to the other 
>> netdev stats.
>> So far hw extended stats have always been available through this separate 
>> ethtool
>> channel. The intent here is to unify the api for extended hw and software 
>> only stats.
>
> Ok, so these are _not_ the stats which are broken down by packet size
> ranges which are quite common; but rather "proprietary" per port
> type h/w stats? I browsed a couple of users of ethtool_stats and i see
> they return proprietary looking 64 bit counters (batman for example
> has a very strange meaning to those stats).
> What i meant is a lot of ASICS have counters for byte ranges
> [64bytes-128bytes], [128bytes-256bytes], etc - sorry i cant pin a name
> to those but i am sure you have seen them and i thought those at minimal
> need their own TLV since they are always fixed.

yes, I think i have seen this. But, these if presented by the driver or 
hardware,
can be part of the extended hw stats (just like how ethtool would do).
So, there is the base netdev stats IFLA_STATS_LINK64 which is nothing but
rtnl_link_stats64 (which this patch adds). And this patch does not add but lists
examples for future additions IFLA_STATS_EXTENDED (for example: bridge
(can include igmp, stp, vlan stats here). bond can include lacp, and other stats 
here). All these will be TLV based.
and note that this patch does not restrict or define the design for these yet.


>
>> XSTATS is per netdev can be included as a nested attribute inside 
>> IFLA_EXTENDED_STATS
>> which are per netdev. bridge vlan stats will also fall here.
>>
>
> And you are going to distinguish which come from hardware and which are
> software derived?

a) IFLA_EXTENDED_STATS - these are all software. ( for example: bridge (can 
include 
igmp, stp, vlan stats here).
bond can include lacp, and other stats here).

b) IFLA_HW_EXTENDED_STATS - for hardware stats. for a switch port these will be 
similar to ethtool
physical port stats today. For logical devices, this can include specific hw 
stats that switch asics
provide and those which cannot be added to the software stats.

And since this patch does not define the format for these stats yet, we can 
potentially design it when we see the first case of such stats.




>
>> And this api  provides netdev specific stats. We have always mapped all
>> asic stats to the switch port netdev stats. and this api does not cover the 
>> non-netdev specific stats.
>> If you are for example asking for stats for a hardware offloaded bridge, 
>> then yes, they will fall here
>> and be available on the bridge netdev. For asic stats that don't map to any 
>> netdev, devlink will be an
>> appropriate infrastructure IMO.
>>
>> I am not sure if I answered your question :).
>>
>
> It is useful to have this discussion; unfortunately these user APIS once are 
> in will never be allowed to change. The answers could come
> in time.

yep, agreed.
>
>>> Should such a command then not be rejected with an error code?
>> Dumps with no data are not rejected with an error code AFAIK. ie they don't 
>> return
>> -ENODATA. This is consistent with all other dumps (unless i missed it).
>>   But, if there is a need for an error code, i can certainly check again.
>>
>
> It is mostly because you chose a whitelist filter i.e you list what
> is allowed to be sent back. And if such a list is missing there
> needs to be an opposite default (which is a deny all).

ok, by default no-filter = no-data is what we decided. And that
keeps the api simple. will think some more about if we should return
an err in case of no-filter.


>
>

[snip]

> True, but it must be 32 bit aligned. So something like:
> struct if_stats_msg {
>  __u8  family;
>  __u8 pad1;
>  __u16 pad2;
> __u32 ifindex;
> __u32 filter_mask;
> }
>

ok ack

>
>> Yeah i remember :). But deferring it for a later incremental feature. That 
>> needs some more thought.
>
> NP ;->
>
>> Right now there is an urgent need to get the basic get stats api for a bunch 
>> of other stats: mpls, bridge vlan etc.
>> Because it is not clean to extend the current stats infra part of the link 
>> message for this. So trying to get this in first.
>>
>
> Agreed.
> The only thing i strongly feel about is the if_stats_msg - please fix
> that and lets get at least the basics in. We can resolve other things
> with further discussions.

will do. Thanks for the review.


[PATCH 4.5 155/238] ARC: [plat-axs10x] add Ethernet PHY description in .dts

2016-04-10 Thread Greg Kroah-Hartman
4.5-stable review patch.  If anyone has any objections, please let me know.

--

From: Alexey Brodkin 

commit 667a490bdb6e27db0887d2ca515b907d6aa87118 upstream.

Commit e34d65696d2e ("stmmac: create of compatible mdio bus for stmmac
driver") broke DW GMAC functionality on ARC AXS10x boards:

That's what happens on eth0 up:
  --->8
| libphy: PHY stmmac-0: not found
| eth0: Could not attach to PHY
| stmmac_open: Cannot attach to PHY (error: -19)
  --->8

Simplest solution is to add PHY description in board's .dts.
And so we do here.

Signed-off-by: Alexey Brodkin 
Cc: Rob Herring 
Cc: Phil Reid 
Cc: David S. Miller 
Cc: linux-ker...@vger.kernel.org
Cc: netdev@vger.kernel.org
Reviewed-by: Sergei Shtylyov 
Signed-off-by: Vineet Gupta 
Signed-off-by: Greg Kroah-Hartman 

---
 arch/arc/boot/dts/axs10x_mb.dtsi |8 
 1 file changed, 8 insertions(+)

--- a/arch/arc/boot/dts/axs10x_mb.dtsi
+++ b/arch/arc/boot/dts/axs10x_mb.dtsi
@@ -47,6 +47,14 @@
clocks = <&apbclk>;
clock-names = "stmmaceth";
max-speed = <100>;
+   mdio0 {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   compatible = "snps,dwmac-mdio";
+   phy1: ethernet-phy@1 {
+   reg = <1>;
+   };
+   };
};
 
ehci@0x4 {




Re: [Lsf] [Lsf-pc] [LSF/MM TOPIC] Generic page-pool recycle facility?

2016-04-10 Thread Sagi Grimberg



This is also very interesting for storage targets, which face the same
issue.  SCST has a mode where it caches some fully constructed SGLs,
which is probably very similar to what NICs want to do.


I think a cached allocator for page sets + the scatterlists that
describe these page sets would not only be useful for SCSI target
implementations but also for the Linux SCSI initiator. Today the scsi-mq
code reserves space in each scsi_cmnd for a scatterlist of
SCSI_MAX_SG_SEGMENTS. If scatterlists would be cached together with page
sets less memory would be needed per scsi_cmnd.


If we go down this road how about also attaching some driver opaques
to the page sets?

I know of some drivers that can make good use of those ;)


Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link

2016-04-10 Thread Brenden Blanco
On Sat, Apr 09, 2016 at 01:27:03PM -0400, Jamal Hadi Salim wrote:
> On 16-04-09 12:43 PM, Brenden Blanco wrote:
> >On Sat, Apr 09, 2016 at 10:48:05AM -0400, Jamal Hadi Salim wrote:
> 
> 
> >>Ok, sorry - should have looked this far before sending earlier email.
> >>So when you run concurently you see about 5Mpps per core but if you
> >>shoot all traffic at a single core you see 20Mpps?
> >No, only sender is multiple, receiver is still single core. The flow is
> >the same in all 4 of the send threads. Note that only ksoftirqd/6 is
> >active.
> 
> Got it.
> The sender was limited to the 20Mpps and you are able to keep up
> if i understand correctly.
Perhaps, though I can't say 100%. The sender is able to do about 21/22
Mpps when pause frames are disabled. The sender is likely CPU limited as
it is an older Xeon.
> 
> 
> >>
> >>Devil's advocate question:
> >>If the bottleneck is the driver - is there an advantage in adding the
> >>bpf code at all in the driver?
> >Only by adding this hook into the driver has it become the bottleneck.
> >
> >Prior to this, the bottleneck was later in the codepath, primarily in
> >allocations.
> >
> 
> Maybe useful in your commit log to show the prior and after.
I can add this, sure.
> Looking at both your and Daniel's profile you show in this email
> mlx4_en_process_rx_cq() seems to be where the action is on both, no?
I don't draw this conclusion. With the phys_dev drop,
mlx4_en_process_rx_cq is the majority time consumer. In the perf output
showing drop in tc, the functions such as dev_gro_receive,
kmem_cache_free, napi_gro_frags, inet_gro_receive, __build_skb, etc
combined add up to 60% of the time spent. None of these are called when
early drop occurs. Just because mlx4_en_process_rx_cq is at the top of
the list doesn't mean it is the lowest hanging fruit.
> 
> >If a packet is to be dropped, and a determination can be made with fewer
> >cpu cycles spent, then there is more time for the goodput.
> >
> 
> Agreed.
> 
> >Beyond that, even if the skb allocation gets 10x or 100x or whatever
> >improvement, there is still a non-zero cost associated, and dropping bad
> >packets with minimal time spent has value. The same argument holds for
> >physical nic forwarding decisions.
> >
> 
> I always go for the lowest hanging fruit.
Which to me is the 60% time spent above the driver level as shown above.
> It seemed it was the driver path in your case. When we removed
> the driver overhead (as demoed at the tc workshop in netdev11) we saw
> __netif_receive_skb_core() at the top of the profile.
> So in this case seems it was mlx4_en_process_rx_cq() - thats why i
> was saying the bottleneck is the driver.
I wouldn't call it a bottleneck when the time spent is additive,
aka run-to-completion.
> Having said that: I agree that early drop is useful if not for anything
> else to avoid the longer code path (but was worried after reading on
> thread this was going to get into a messy stack-in-the-driver and i am
> not sure it is avoidable either given a new ops interface is showing
>  up).
> 
> >>I am more curious than before to see the comparison for the same bpf code
> >>running at tc level vs in the driver..
> >Here is a perf report for drop in the clsact qdisc with direct-action,
> >which Daniel earlier showed to have the best performance to-date. On my
> >machine, this gets about 6.5Mpps drop single core. Drop due to failed
> >IP lookup (not shown here) is worse @4.5Mpps.
> >
> 
> Nice.
> However, still for this to be orange/orange comparison you have to
> run it on the _same receiver machine_ as opposed to Daniel doing
> it on his for the one case. And two different kernels booted up
> one patched  with your changes and another virgin without them.
Of course the second perf report is on the same machine as the commit
message. That was generated fresh for this email thread. All of the
numbers I've quoted come from the same single-sender/single-receiver
setup. I did also revert the change in the mlx4 driver and there was no
change in the tc numbers.
> 
> cheers,
> jamal


Re: [PATCH net-next v2 1/2] rtnetlink: add new RTM_GETSTATS message to dump link stats

2016-04-10 Thread roopa
On 4/10/16, 1:16 AM, Thomas Graf wrote:
> On 04/08/16 at 11:38pm, Roopa Prabhu wrote:
>> From: Roopa Prabhu 
>>
>> This patch adds a new RTM_GETSTATS message to query link stats via netlink
>> from the kernel. RTM_NEWLINK also dumps stats today, but RTM_NEWLINK
>> returns a lot more than just stats and is expensive in some cases when
>> frequent polling for stats from userspace is a common operation.
>>
>> RTM_GETSTATS is an attempt to provide a light weight netlink message
> >> to explicitly query only link stats from the kernel on an interface.
>> The idea is to also keep it extensible so that new kinds of stats can be
>> added to it in the future.
>>
>> This patch adds the following attribute for NETDEV stats:
>> struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
>> [IFLA_STATS_LINK64]  = { .len = sizeof(struct rtnl_link_stats64) },
>> };
>>
>> This patch also allows for af family stats (an example af stats for IPV6
>> is available with the second patch in the series).
>>
>> Like any other rtnetlink message, RTM_GETSTATS can be used to get stats of
>> a single interface or all interfaces with NLM_F_DUMP.
> Awesome stuff Roopa.
>
> This currently ties everything to a net_device with a selector to
> include certain bits of that net_device. How about we take it half a
> step further and allow for non net_device stats such as IP, TCP,
> routing or ipsec stats to be retrieved as well?
yes, absolutely. and that is also the goal.
> A simple array of nested attributes replacing IFLA_STATS_* would
> allow for that, e.g.
>
> 1. {.type = ST_IPSTATS, value = { ...} }
>
> 2. {.type = ST_LINK, .value = {
> {.type = ST_LINK_NAME, .value = "eth0"},
> {.type = ST_LINK_Q, .value = 10}
>   }}
>
> 3. ...

One thing though,  Its unclear to me if we absolutely need the additional nest.
Every stats netlink msg has an ifindex in the header (if_stats_msg) if the scope
of the stats is a netdev. If the msg does not have an ifindex in the 
if_stats_msg,
it represents a global stat. ie Cant a dump, include other stats netlink msgs 
after
all the netdev msgs are done when the filter has global stat filters ?.
same will apply to RTM_GETSTATS (without NLM_F_DUMP).

Since the msg may potentially have more nest levels
in the IFLA_EXT_STATS categories, just trying to see if i can avoid adding 
another
top-level nest. We can sure add it if there is no other way to include global
stats in the same dump.
> So for your initial patch, you'd simply add a new top level attribute
> which ties all the net_device specific statistics to a top level
> attribute and we can add the non net_device specific stats later on.
> We can't do that later on though without breaking ABI so that would
> have to go in with the first iteration.
agreed
>
> We can preserve all the existing attribute formats, we simply have
> to introduce new attribute types which don't overlap and document
> which statistic identifier maps to which existing attribute id.
sure,

thanks Thomas.


Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-10 Thread Jamal Hadi Salim

On 16-04-10 12:53 PM, Tom Herbert wrote:


We started discussions about this in IOvisor. The Huawei project is
called ceth (Common Ethernet). It is essentially a layer called
directly from drivers intended for fast path forwarding and network
virtualization. They have put quite a bit of effort into buffer
management and other parts of the infrastructure, much of which we
would like to leverage in XDP. The code is currently in github, will
ask them to make it generally accessible.



Cant seem to find any info on it on the googles.
If it is forwarding then it should hopefully at least make use of Linux
control APIs I hope.

cheers,
jamal



Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-10 Thread Tom Herbert
On Sun, Apr 10, 2016 at 12:55 AM, Thomas Graf  wrote:
> On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
>> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
>> > If this is _forwarding only_ it maybe useful to look at
>> > Alexey's old code in particular the DMA bits;
>> > he built his own lookup algorithm but sounds like bpf is
>> > a much better fit today.
>>
>> a link to these old bits?
>>
>> Just to be clear: this rfc is not the only thing we're considering.
>> In particular huawei guys did a monster effort to improve performance
>> in this area as well. We'll try to blend all the code together and
>> pick what's the best.
>
> What's the plan on opening the discussion on this? Can we get a peek?
> Is it an alternative to XDP and the driver hook? Different architecture
> or just different implementation? I understood it as another pseudo
> skb model with a path on converting to real skbs for stack processing.
>
We started discussions about this in IOvisor. The Huawei project is
called ceth (Common Ethernet). It is essentially a layer called
directly from drivers intended for fast path forwarding and network
virtualization. They have put quite a bit of effort into buffer
management and other parts of the infrastructure, much of which we
would like to leverage in XDP. The code is currently in github, will
ask them to make it generally accessible.

The programmability part, essentially BPF, should be part of a common
solution. We can define the necessary interfaces independently of the
underlying infrastructure which is really the only way we can do this
if we want the BPF programs to be portable across different
platforms-- in Linux, userspace, HW, etc.

Tom

> I really like the current proposal by Brenden for its simplicity and
> targeted compatibility with cls_bpf.


Re: [PATCH net-next v2 1/2] rtnetlink: add new RTM_GETSTATS message to dump link stats

2016-04-10 Thread Jamal Hadi Salim

On 16-04-09 02:00 PM, roopa wrote:


This EXTENDED_HW_STATS is for ethtool like extended hw stats. This is keeping in
mind that we want to also move ethtool to netlink in the future and with 
switchdev
it becomes more necessary that we provide all stats closer to the other netdev 
stats.
So far hw extended stats have always been available through this separate 
ethtool
channel. The intent here is to unify the api for extended hw and software only 
stats.


Ok, so these are _not_ the stats which are broken down by packet size
ranges which are quite common; but rather "proprietary" per port
type h/w stats? I browsed a couple of users of ethtool_stats and i see
they return proprietary looking 64 bit counters (batman for example
has a very strange meaning to those stats).
What i meant is a lot of ASICS have counters for byte ranges
[64bytes-128bytes], [128bytes-256bytes], etc - sorry i cant pin a name
to those but i am sure you have seen them and i thought those at minimal
need their own TLV since they are always fixed.


XSTATS is per netdev can be included as a nested attribute inside 
IFLA_EXTENDED_STATS
which are per netdev. bridge vlan stats will also fall here.



And you are going to distinguish which come from hardware and which are
software derived?


And this api  provides netdev specific stats. We have always mapped all
asic stats to the switch port netdev stats. and this api does not cover the 
non-netdev specific stats.
If you are for example asking for stats for a hardware offloaded bridge, then 
yes, they will fall here
and be available on the bridge netdev. For asic stats that don't map to any 
netdev, devlink will be an
appropriate infrastructure IMO.

I am not sure if I answered your question :).



It is useful to have this discussion; unfortunately these user APIs, once
they are in, will never be allowed to change. The answers could come
in time.


Should such a command then not be rejected with an error code?

Dumps with no data are not rejected with an error code AFAIK. ie they don't 
return
-ENODATA. This is consistent with all other dumps (unless i missed it).
  But, if there is a need for an error code, i can certainly check again.



It is mostly because you chose a whitelist filter i.e you list what
is allowed to be sent back. And if such a list is missing there
needs to be an opposite default (which is a deny all).


+/* STATS section */
+
+struct if_stats_msg {
+__u8  family;
+__u32 ifindex;
+__u32 filter_mask;
+};


Needs to be 32 bit aligned.
Do you need 32 bits for the filter mask?

yes, i think we should keep it minimum 32 bits.


Ok, that is fine then. I thought it wont exceed
3-4 per port type but i could be wrong. 32 bits
should be safer.


Perhaps a 16bit mask and an 8bit pad for future use.

struct if_stats_msg {
__u32 ifindex;
__u16 filter_mask;
__u8  family;
__u8 pad; /* future use */
};

Or you could reverse those (from smallest to largest).


The __u8 family needs to be the first field in the structure and at the first 
byte in the header data.
hence family is first and i added the others after that. It follows the format 
for existing such structs (for other message types).



True, but it must be 32 bit aligned. So something like:
struct if_stats_msg {
 __u8  family;
 __u8 pad1;
 __u16 pad2;
__u32 ifindex;
__u32 filter_mask;
}



Yeah i remember :). But deferring it for a later incremental feature. That 
needs some more thought.


NP ;->


Right now there is an urgent need to get the basic get stats api for a bunch of 
other stats: mpls, bridge vlan etc.
Because it is not clean to extend the current stats infra part of the link 
message for this. So trying to get this in first.



Agreed.
The only thing i strongly feel about is the if_stats_msg - please fix
that and let's get at least the basics in. We can resolve other things
with further discussions.


And this patchset only adds a handler for RTM_NEWSTATS dump and get stats. Your 
stats events request should be part of the  RTM_NEWSTATS handler and can 
include other attributes (like timeout) in the future.



Ok.

cheers,
jamal

Thanks,
Roopa





Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-10 Thread Jamal Hadi Salim

On 16-04-09 01:26 PM, Alexei Starovoitov wrote:



yeah, no stack, no queues in bpf.


Thanks.




If this is _forwarding only_ it may be useful to look at
Alexey's old code in particular the DMA bits;
he built his own lookup algorithm but sounds like bpf is
a much better fit today.


a link to these old bits?



Dang. Trying to remember exact name (I think it has been gone for at
least 10 years now). I know it is not CONFIG_NET_FASTROUTE although
it could have been that depending on the driver (tulip had some
nice DMA properties - which by today's standards would be considered
primitive ;->).
+Cc Robert and Alexey (Trying to figure out name of driver based
routing code that DMAed from ingress to egress port)


Just to be clear: this rfc is not the only thing we're considering.
In particular huawei guys did a monster effort to improve performance
in this area as well. We'll try to blend all the code together and
pick what's the best.



Sounds very interesting.

cheers,
jamal


Re: [PATCHv2 net-next 1/6] sctp: add sctp_info dump api for sctp_diag

2016-04-10 Thread Jamal Hadi Salim

On 16-04-09 01:21 PM, Eric Dumazet wrote:



Well, once a hole is there, nothing we can do really, because of
compatibility with old kernels / old binaries.


But when a _new_ structure is defined, this is the time where we can ask
for doing sensible things ;)



This one is fixable. sizeof() already includes the accounting of
the pad. something like:

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fe95446..52542eb 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -158,6 +158,7 @@ struct tcp_info {
__u8tcpi_options;
__u8tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;

+   __u8pad;/*reuse this space if you need 8bits for something*/
__u32   tcpi_rto;
__u32   tcpi_ato;
__u32   tcpi_snd_mss;

cheers,
jamal


Re: AP firmware for TI wl1251 wifi chip (wl1251-fw-ap.bin)

2016-04-10 Thread Pavel Machek
Hi!

> > > wl1251 does not support AP mode, so there is no firmware for it in 
> > > the tree.
> > >
> > > Regards,
> > > Yaniv
> > 
> > Hi Yaniv! I read on some TI whitepaper, that wl1251 hardware supports 
> > some Soft-AP mode. So I expect that either special FW is needed for it 
> > or somehow it is possible to use current released. Do you have any 
> > information about it?
> 
> Hi Pali,
> This must be some typo, the device does not support Soft-AP.
> More than that, wl1251 family is not officially supported via the mainline 
> Linux.


> For Soft-AP, and other new features based on Linux you should use WiLink8 
> chip family.

Too late for that, we already have the devices, and they were manufactured
quite a long time ago.

Is it "hardware can't do AP", "firmware can't do AP" or "current drivers 
do not support AP"?

Best regards,
Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


[PATCH] ath9k: remove duplicate assignment of variable ah

2016-04-10 Thread Colin King
From: Colin Ian King 

ah is written twice with the same value, remove one of the
redundant assignments to ah.

Signed-off-by: Colin Ian King 
---
 drivers/net/wireless/ath/ath9k/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/init.c 
b/drivers/net/wireless/ath/ath9k/init.c
index 77ace8d..fb702c4 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -477,7 +477,7 @@ static void ath9k_eeprom_request_cb(const struct firmware 
*eeprom_blob,
 static int ath9k_eeprom_request(struct ath_softc *sc, const char *name)
 {
struct ath9k_eeprom_ctx ec;
-   struct ath_hw *ah = ah = sc->sc_ah;
+   struct ath_hw *ah = sc->sc_ah;
int err;
 
/* try to load the EEPROM content asynchronously */
-- 
2.7.4



[PATCH net] packet: fix heap info leak in PACKET_DIAG_MCLIST sock_diag interface

2016-04-10 Thread Mathias Krause
Because we fail to wipe the remainder of i->addr[] in packet_mc_add(),
pdiag_put_mclist() leaks uninitialized heap bytes via the
PACKET_DIAG_MCLIST netlink attribute.

Fix this by explicitly memset(0)ing the remaining bytes in i->addr[].

Fixes: eea68e2f1a00 ("packet: Report socket mclist info via diag module")
Signed-off-by: Mathias Krause 
Cc: Eric W. Biederman 
Cc: Pavel Emelyanov 
---
The bug itself precedes commit eea68e2f1a00 but the list wasn't exposed
to userland before the introduction of the packet_diag interface.
Therefore the "Fixes:" line on that commit.

 net/packet/af_packet.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 992396aa635c..86a408cf38d5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3441,6 +3441,7 @@ static int packet_mc_add(struct sock *sk, struct 
packet_mreq_max *mreq)
i->ifindex = mreq->mr_ifindex;
i->alen = mreq->mr_alen;
memcpy(i->addr, mreq->mr_address, i->alen);
+   memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
i->count = 1;
i->next = po->mclist;
po->mclist = i;
-- 
1.7.10.4



[PATCH net-next 2/4] qed: add Rx flow hash/indirection support.

2016-04-10 Thread Yuval Mintz
From: Sudarsana Reddy Kalluru 

Adds the required API for passing RSS-related configuration from qede.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c | 17 +
 include/linux/qed/qed_eth_if.h   |  1 +
 include/linux/qed/qed_if.h   | 11 +++
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c 
b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e848d5a..5005497 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -35,19 +35,6 @@
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 
-enum qed_rss_caps {
-   QED_RSS_IPV4= 0x1,
-   QED_RSS_IPV6= 0x2,
-   QED_RSS_IPV4_TCP= 0x4,
-   QED_RSS_IPV6_TCP= 0x8,
-   QED_RSS_IPV4_UDP= 0x10,
-   QED_RSS_IPV6_UDP= 0x20,
-};
-
-/* Should be the same as ETH_RSS_IND_TABLE_ENTRIES_NUM */
-#define QED_RSS_IND_TABLE_SIZE 128
-#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
-
 struct qed_rss_params {
u8  update_rss_config;
u8  rss_enable;
@@ -1744,9 +1731,7 @@ static int qed_update_vport(struct qed_dev *cdev,
sp_rss_params.update_rss_capabilities = 1;
sp_rss_params.update_rss_ind_table = 1;
sp_rss_params.update_rss_key = 1;
-   sp_rss_params.rss_caps = QED_RSS_IPV4 |
-QED_RSS_IPV6 |
-QED_RSS_IPV4_TCP | QED_RSS_IPV6_TCP;
+   sp_rss_params.rss_caps = params->rss_params.rss_caps;
sp_rss_params.rss_table_size_log = 7; /* 2^7 = 128 */
memcpy(sp_rss_params.rss_ind_table,
   params->rss_params.rss_ind_table,
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e00c8db..795c990 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -27,6 +27,7 @@ struct qed_dev_eth_info {
 struct qed_update_vport_rss_params {
u16 rss_ind_table[128];
u32 rss_key[10];
+   u8  rss_caps;
 };
 
 struct qed_update_vport_params {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index b007011..67e8c20 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -515,4 +515,15 @@ static inline void internal_ram_wr(void __iomem *addr,
__internal_ram_wr(NULL, addr, size, data);
 }
 
+enum qed_rss_caps {
+   QED_RSS_IPV4= 0x1,
+   QED_RSS_IPV6= 0x2,
+   QED_RSS_IPV4_TCP= 0x4,
+   QED_RSS_IPV6_TCP= 0x8,
+   QED_RSS_IPV4_UDP= 0x10,
+   QED_RSS_IPV6_UDP= 0x20,
+};
+
+#define QED_RSS_IND_TABLE_SIZE 128
+#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */
 #endif
-- 
1.9.3



[PATCH net-next 0/4] qed*: [mostly] Ethtool RSS configuration

2016-04-10 Thread Yuval Mintz
Most of the content [code-wise] in this series is for allowing various
RSS-related configuration via ethtool.

In addition, this also removes an unnecessary versioning scheme between
the drivers and bumps the driver version.

Dave,

Please consider applying this to `net-next'.

Thanks,
Yuval



[PATCH 3/4] qede: add Rx flow hash/indirection support.

2016-04-10 Thread Yuval Mintz
From: Sudarsana Reddy Kalluru 

Adds support for the following via ethtool:
  - UDP configuration of RSS based on 2-tuple/4-tuple.
  - RSS hash key.
  - RSS indirection table.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/qede/qede.h |   4 +
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 237 +++-
 drivers/net/ethernet/qlogic/qede/qede_main.c|  52 +-
 3 files changed, 283 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index e0a696a..80dbb73 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -154,6 +154,10 @@ struct qede_dev {
  SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
struct qede_stats   stats;
+#define QEDE_RSS_INDIR_INITED  BIT(0)
+#define QEDE_RSS_KEY_INITEDBIT(1)
+#define QEDE_RSS_CAPS_INITED   BIT(2)
+   u32 rss_params_inited; /* bit-field to track initialized rss params */
struct qed_update_vport_rss_params  rss_params;
u16 q_num_rx_buffers; /* Must be a power of two */
u16 q_num_tx_buffers; /* Must be a power of two */
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c 
b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index c49dc10..f0982f1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -569,6 +569,236 @@ static int qede_set_phys_id(struct net_device *dev,
return 0;
 }
 
+static int qede_get_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc 
*info)
+{
+   info->data = RXH_IP_SRC | RXH_IP_DST;
+
+   switch (info->flow_type) {
+   case TCP_V4_FLOW:
+   case TCP_V6_FLOW:
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   break;
+   case UDP_V4_FLOW:
+   if (edev->rss_params.rss_caps & QED_RSS_IPV4_UDP)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   break;
+   case UDP_V6_FLOW:
+   if (edev->rss_params.rss_caps & QED_RSS_IPV6_UDP)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   break;
+   case IPV4_FLOW:
+   case IPV6_FLOW:
+   break;
+   default:
+   info->data = 0;
+   break;
+   }
+
+   return 0;
+}
+
+static int qede_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
+ u32 *rules __always_unused)
+{
+   struct qede_dev *edev = netdev_priv(dev);
+
+   switch (info->cmd) {
+   case ETHTOOL_GRXRINGS:
+   info->data = edev->num_rss;
+   return 0;
+   case ETHTOOL_GRXFH:
+   return qede_get_rss_flags(edev, info);
+   default:
+   DP_ERR(edev, "Command parameters not supported\n");
+   return -EOPNOTSUPP;
+   }
+}
+
+static int qede_set_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc 
*info)
+{
+   struct qed_update_vport_params vport_update_params;
+   u8 set_caps = 0, clr_caps = 0;
+
+   DP_VERBOSE(edev, QED_MSG_DEBUG,
+  "Set rss flags command parameters: flow type = %d, data = 
%llu\n",
+  info->flow_type, info->data);
+
+   switch (info->flow_type) {
+   case TCP_V4_FLOW:
+   case TCP_V6_FLOW:
+   /* For TCP only 4-tuple hash is supported */
+   if (info->data ^ (RXH_IP_SRC | RXH_IP_DST |
+ RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+   DP_INFO(edev, "Command parameters not supported\n");
+   return -EINVAL;
+   }
+   return 0;
+   case UDP_V4_FLOW:
+   /* For UDP either 2-tuple hash or 4-tuple hash is supported */
+   if (info->data == (RXH_IP_SRC | RXH_IP_DST |
+  RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+   set_caps = QED_RSS_IPV4_UDP;
+   DP_VERBOSE(edev, QED_MSG_DEBUG,
+  "UDP 4-tuple enabled\n");
+   } else if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
+   clr_caps = QED_RSS_IPV4_UDP;
+   DP_VERBOSE(edev, QED_MSG_DEBUG,
+  "UDP 4-tuple disabled\n");
+   } else {
+   return -EINVAL;
+   }
+   break;
+   case UDP_V6_FLOW:
+   /* For UDP either 2-tuple hash or 4-tuple hash is supported */
+   if (info->data == (RXH_IP_SRC | RXH_IP_DST |
+  RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+   set_caps = QED_RSS_IPV6_UDP;
+   DP_VERBOSE(edev, QED_MSG_DEBUG,
+  "UDP 4-tuple enabled\n");
+   } else if (info->

[PATCH 4/4] qed* - bump driver versions to 8.7.1.20

2016-04-10 Thread Yuval Mintz
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/qed/qed.h   | 2 +-
 drivers/net/ethernet/qlogic/qede/qede.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h 
b/drivers/net/ethernet/qlogic/qed/qed.h
index a3ee9df..0f0d2d1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -26,7 +26,7 @@
 #include "qed_hsi.h"
 
 extern const struct qed_common_ops qed_common_ops_pass;
-#define DRV_MODULE_VERSION "8.7.0.0"
+#define DRV_MODULE_VERSION "8.7.1.20"
 
 #define MAX_HWFNS_PER_DEVICE(4)
 #define NAME_SIZE 16
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index 80dbb73..41c4189 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -25,8 +25,8 @@
 
 #define QEDE_MAJOR_VERSION 8
 #define QEDE_MINOR_VERSION 7
-#define QEDE_REVISION_VERSION  0
-#define QEDE_ENGINEERING_VERSION   0
+#define QEDE_REVISION_VERSION  1
+#define QEDE_ENGINEERING_VERSION   20
 #define DRV_MODULE_VERSION __stringify(QEDE_MAJOR_VERSION) "." \
__stringify(QEDE_MINOR_VERSION) "." \
__stringify(QEDE_REVISION_VERSION) "."  \
-- 
1.9.3



[PATCH net-next 1/4] qed*: remove version dependency

2016-04-10 Thread Yuval Mintz
From: Rahul Verma 

Inbox drivers don't need a versioning scheme in order to guarantee
compatibility, as both qed and qede are compiled from the same codebase.

Signed-off-by: Rahul Verma 
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/qed/qed.h|  2 --
 drivers/net/ethernet/qlogic/qed/qed_l2.c |  8 +---
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 11 ---
 drivers/net/ethernet/qlogic/qede/qede.h  |  2 --
 drivers/net/ethernet/qlogic/qede/qede_main.c | 11 +--
 include/linux/qed/qed_eth_if.h   |  2 +-
 include/linux/qed/qed_if.h   |  9 -
 7 files changed, 3 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h 
b/drivers/net/ethernet/qlogic/qed/qed.h
index fcb8e9b..a3ee9df 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -507,6 +507,4 @@ u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
 
 int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
 
-#define QED_ETH_INTERFACE_VERSION   300
-
 #endif /* _QED_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c 
b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 3f35c6c..e848d5a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2043,14 +2043,8 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
.get_vport_stats = &qed_get_vport_stats,
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version)
+const struct qed_eth_ops *qed_get_eth_ops(void)
 {
-   if (version != QED_ETH_INTERFACE_VERSION) {
-   pr_notice("Cannot supply ethtool operations [%08x != %08x]\n",
- version, QED_ETH_INTERFACE_VERSION);
-   return NULL;
-   }
-
return &qed_eth_ops_pass;
 }
 EXPORT_SYMBOL(qed_get_eth_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c 
b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 26d40db..c31d485 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1172,14 +1172,3 @@ const struct qed_common_ops qed_common_ops_pass = {
.chain_free = &qed_chain_free,
.set_led = &qed_set_led,
 };
-
-u32 qed_get_protocol_version(enum qed_protocol protocol)
-{
-   switch (protocol) {
-   case QED_PROTOCOL_ETH:
-   return QED_ETH_INTERFACE_VERSION;
-   default:
-   return 0;
-   }
-}
-EXPORT_SYMBOL(qed_get_protocol_version);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index d023251..e0a696a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -32,8 +32,6 @@
__stringify(QEDE_REVISION_VERSION) "."  \
__stringify(QEDE_ENGINEERING_VERSION)
 
-#define QEDE_ETH_INTERFACE_VERSION 300
-
 #define DRV_MODULE_SYM qede
 
 struct qede_stats {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 518af32..a55d93e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -141,19 +141,10 @@ static
 int __init qede_init(void)
 {
int ret;
-   u32 qed_ver;
 
pr_notice("qede_init: %s\n", version);
 
-   qed_ver = qed_get_protocol_version(QED_PROTOCOL_ETH);
-   if (qed_ver !=  QEDE_ETH_INTERFACE_VERSION) {
-   pr_notice("Version mismatch [%08x != %08x]\n",
- qed_ver,
- QEDE_ETH_INTERFACE_VERSION);
-   return -EINVAL;
-   }
-
-   qed_ops = qed_get_eth_ops(QEDE_ETH_INTERFACE_VERSION);
+   qed_ops = qed_get_eth_ops();
if (!qed_ops) {
pr_notice("Failed to get qed ethtool operations\n");
return -EINVAL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index e1d6983..e00c8db 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -167,7 +167,7 @@ struct qed_eth_ops {
struct qed_eth_stats *stats);
 };
 
-const struct qed_eth_ops *qed_get_eth_ops(u32 version);
+const struct qed_eth_ops *qed_get_eth_ops(void);
 void qed_put_eth_ops(void);
 
 #endif
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 1f7599c7..b007011 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -271,15 +271,6 @@ struct qed_common_ops {
   enum qed_led_mode mode);
 };
 
-/**
- * @brief qed_get_protocol_version
- *
- * @param protocol
- *
- * @return version supported by qed for given protocol driver
- */
-u32 qed_get_protocol_version(enum qed_protocol protocol);
-
 #define MASK_FIELD(_name, _value) \
((_value) &= (_name ## _MASK))
 
-- 
1.9.3



Re: [PATCH net-next v2 1/2] rtnetlink: add new RTM_GETSTATS message to dump link stats

2016-04-10 Thread Thomas Graf
On 04/08/16 at 11:38pm, Roopa Prabhu wrote:
> From: Roopa Prabhu 
> 
> This patch adds a new RTM_GETSTATS message to query link stats via netlink
> from the kernel. RTM_NEWLINK also dumps stats today, but RTM_NEWLINK
> returns a lot more than just stats and is expensive in some cases when
> frequent polling for stats from userspace is a common operation.
> 
> RTM_GETSTATS is an attempt to provide a light weight netlink message
> to explicitly query only link stats from the kernel on an interface.
> The idea is to also keep it extensible so that new kinds of stats can be
> added to it in the future.
> 
> This patch adds the following attribute for NETDEV stats:
> struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
> [IFLA_STATS_LINK64]  = { .len = sizeof(struct rtnl_link_stats64) },
> };
> 
> This patch also allows for af family stats (an example af stats for IPV6
> is available with the second patch in the series).
> 
> Like any other rtnetlink message, RTM_GETSTATS can be used to get stats of
> a single interface or all interfaces with NLM_F_DUMP.

Awesome stuff Roopa.

This currently ties everything to a net_device with a selector to
include certain bits of that net_device. How about we take it half a
step further and allow for non net_device stats such as IP, TCP,
routing or ipsec stats to be retrieved as well?

A simple array of nested attributes replacing IFLA_STATS_* would
allow for that, e.g.

1. {.type = ST_IPSTATS, value = { ...} }

2. {.type = ST_LINK, .value = {
{.type = ST_LINK_NAME, .value = "eth0"},
{.type = ST_LINK_Q, .value = 10}
  }}

3. ...

So for your initial patch, you'd simply add a new top level attribute
which ties all the net_device specific statistics to a top level
attribute and we can add the non net_device specific stats later on.
We can't do that later on though without breaking ABI so that would
have to go in with the first iteration.

We can preserve all the existing attribute formats, we simply have
to introduce new attribute types which don't overlap and document
which statistic identifier maps to which existing attribute id.


Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-10 Thread Thomas Graf
On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
> > If this is _forwarding only_ it maybe useful to look at
> > Alexey's old code in particular the DMA bits;
> > he built his own lookup algorithm but sounds like bpf is
> > a much better fit today.
> 
> a link to these old bits?
> 
> Just to be clear: this rfc is not the only thing we're considering.
> In particular huawei guys did a monster effort to improve performance
> in this area as well. We'll try to blend all the code together and
> pick what's the best.

What's the plan on opening the discussion on this? Can we get a peek?
Is it an alternative to XDP and the driver hook? Different architecture
or just different implementation? I understood it as another pseudo
skb model with a path on converting to real skbs for stack processing.

I really like the current proposal by Brenden for its simplicity and
targeted compatibility with cls_bpf.


[PATCHv2] wlcore: spi: add wl18xx support

2016-04-10 Thread Reizer, Eyal
Add support for use with both wl12xx and wl18xx.

- the whole wilink family needs a special init command for entering wspi mode.
  extra clock cycles should be sent after the spi init command while the
  cs pin is high.
- switch to controlling the cs pin from the spi driver for achieving the
  above.
- the selected cs gpio is read from the spi device-tree node using the
  cs-gpios field and setup as a gpio.
- See the example below for specifying the cs gpio using the cs-gpios entry
&spi0   {
...
cs-gpios = <&gpio0 5 0>;
...
wlcore: wlcore@0 {
compatible = "ti,wl1835";
...
...
};
};

Signed-off-by: Eyal Reizer 
---
v1 -> v2: update device tree bindings documentation

 .../bindings/net/wireless/ti,wlcore,spi.txt|   50 +-
 drivers/net/wireless/ti/wlcore/spi.c   |  176 +---
 2 files changed, 200 insertions(+), 26 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt 
b/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
index 9180724..912ab0c 100644
--- a/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
+++ b/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
@@ -1,19 +1,31 @@
-* Texas Instruments wl1271 wireless lan controller
+* Texas Instruments wl12xx/wl18xx wireless lan controller
 
-The wl1271 chip can be connected via SPI or via SDIO. This
+The wl12xx/wl18xx chips can be connected via SPI or via SDIO. This
 document describes the binding for the SPI connected chip.
 
 Required properties:
-- compatible :  Should be "ti,wl1271"
+- compatible :  Should be one of the following:
+* "ti,wl1271"
+* "ti,wl1273"
+* "ti,wl1281"
+* "ti,wl1283"
+* "ti,wl1801"
+* "ti,wl1805"
+* "ti,wl1807"
+* "ti,wl1831"
+* "ti,wl1835"
+* "ti,wl1837"
 - reg : Chip select address of device
 - spi-max-frequency :   Maximum SPI clocking speed of device in Hz
-- ref-clock-frequency : Reference clock frequency
 - interrupt-parent, interrupts :
 Should contain parameters for 1 interrupt line.
 Interrupt parameters: parent, line number, type.
-- vwlan-supply :Point the node of the regulator that powers/enable the 
wl1271 chip
+- vwlan-supply :Point the node of the regulator that powers/enable the
+wl12xx/wl18xx chip
+- cs-gpios :GPIO pin used as the spi chip select
 
 Optional properties:
+- ref-clock-frequency : Reference clock frequency (should be set for wl12xx)
 - clock-xtal :  boolean, clock is generated from XTAL
 
 - Please consult Documentation/devicetree/bindings/spi/spi-bus.txt
@@ -21,10 +33,16 @@ Optional properties:
 
 Examples:
 
+For wl12xx family:
 &spi1 {
-   wl1271@1 {
+   status = "okay";
+   pinctrl-names = "default";
+   pinctrl-0 = <&spi1_pins>;
+   cs-gpios = <&gpio0 5 0>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   wlcore: wlcore@0 {
compatible = "ti,wl1271";
-
reg = <1>;
spi-max-frequency = <4800>;
clock-xtal;
@@ -34,3 +52,21 @@ Examples:
vwlan-supply = <&vwlan_fixed>;
};
 };
+
+For wl18xx family:
+&spi0  {
+   status = "okay";
+   pinctrl-names = "default";
+   pinctrl-0 = <&spi0_pins>;
+   cs-gpios = <&gpio0 5 0>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   wlcore: wlcore@0 {
+   compatible = "ti,wl1835";
+   vwlan-supply = <&wlan_en_reg>;
+   spi-max-frequency = <4800>;
+   reg = <0>;
+   interrupt-parent = <&gpio0>;
+   interrupts = <27 IRQ_TYPE_EDGE_RISING>;
+   };
+};
diff --git a/drivers/net/wireless/ti/wlcore/spi.c 
b/drivers/net/wireless/ti/wlcore/spi.c
index 020ac1a..fb48a0d 100644
--- a/drivers/net/wireless/ti/wlcore/spi.c
+++ b/drivers/net/wireless/ti/wlcore/spi.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "wlcore.h"
 #include "wl12xx_80211.h"
@@ -70,16 +71,30 @@
 #define WSPI_MAX_CHUNK_SIZE4092
 
 /*
- * only support SPI for 12xx - this code should be reworked when 18xx
- * support is introduced
+ * wl18xx driver aggregation buffer size is (13 * PAGE_SIZE) compared to
+ * (4 * PAGE_SIZE) for wl12xx, so use the larger buffer needed for wl18xx
  */
-#define SPI_AGGR_BUFFER_SIZE (4 * PAGE_SIZE)
+#define SPI_AGGR_BUFFER_SIZE (13 * PAGE_SIZE)
 
 /* Maximum number of SPI write chunks */
 #define WSPI_MAX_NUM_OF_CHUNKS \
((SPI_AGGR_BUFFER_SIZE / WSPI_MAX_CHUNK_SIZE) + 1)
 
 
+struct wilink_familiy_data {
+   char name[8];
+};
+
+const struct wilink_familiy_data *wilink_data;
+
+static const struct wilink_familiy_data wl18xx_data = {
+   .name = "wl18xx",
+};
+
+static const struct wilink_familiy_data wl12xx_data = {
+   .name = "wl12xx",
+};
+
 struc