RE: [PATCH] Maxim/driver: Add driver for maxim ds26522

2016-06-23 Thread Qiang Zhao
On Thu, 2016-06-23 at 10:59PM, David Miller wrote:
> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Thursday, June 23, 2016 10:59 PM
> To: Qiang Zhao 
> Cc: o...@buserror.net; linux-ker...@vger.kernel.org; netdev@vger.kernel.org;
> Xiaobo Xie 
> Subject: Re: [PATCH] Maxim/driver: Add driver for maxim ds26522
> 
> From: Zhao Qiang 
> Date: Thu, 23 Jun 2016 09:09:45 +0800
> 
> > +MODULE_DESCRIPTION(DRV_DESC);
> 
> There is no definition of DRV_DESC, so this makes it look like you didn't even
> compile this driver.

I really, really compiled this driver.
Thank you for your review and comments. I will modify it in the next version.

[zhaoqiang@titan:~/upstream/linux]$ll drivers/net/wan/slic_ds26522.o
-rw-r--r-- 1 zhaoqiang klocwork 153288 Jun 22 15:48 
drivers/net/wan/slic_ds26522.o
[zhaoqiang@titan:~/upstream/linux]$date
Fri Jun 24 09:42:16 CST 2016

-Zhao Qiang
BR


Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-23 Thread Yuval Mintz
>We already know of one firmware bug you guys have which makes 
>it clear that the bnx2x is not doing hardware assisted GRO it is doing 
>something else since it performs much worse than GRO if the MSS is 
>less than what it would be based on the MTU.

It's a bit nitpicky, isn't it? Claiming this flaw means it's not GRO.
I.e., you obviously wouldn't have claimed it became GRO if it
was fixed.

Not saying it makes a lot of difference, though.


[PATCH v4 01/16] bluetooth: Switch SMP to crypto_cipher_encrypt_one()

2016-06-23 Thread Andy Lutomirski
SMP does ECB crypto on stack buffers.  This is complicated and
fragile, and it will not work if the stack is virtually allocated.

Switch to the crypto_cipher interface, which is simpler and safer.

Cc: Marcel Holtmann 
Cc: Gustavo Padovan 
Cc: Johan Hedberg 
Cc: "David S. Miller" 
Cc: linux-blueto...@vger.kernel.org
Cc: Herbert Xu 
Cc: netdev@vger.kernel.org
Signed-off-by: Andy Lutomirski 
---
 net/bluetooth/smp.c | 67 ++---
 1 file changed, 28 insertions(+), 39 deletions(-)

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 50976a6481f3..4c1a16a96ae5 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -22,9 +22,9 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -88,7 +88,7 @@ struct smp_dev {
u8  min_key_size;
u8  max_key_size;
 
-   struct crypto_skcipher  *tfm_aes;
+   struct crypto_cipher*tfm_aes;
struct crypto_shash *tfm_cmac;
 };
 
@@ -127,7 +127,7 @@ struct smp_chan {
u8  dhkey[32];
u8  mackey[16];
 
-   struct crypto_skcipher  *tfm_aes;
+   struct crypto_cipher*tfm_aes;
struct crypto_shash *tfm_cmac;
 };
 
@@ -361,10 +361,8 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 
w[16],
  * s1 and ah.
  */
 
-static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
+static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
 {
-   SKCIPHER_REQUEST_ON_STACK(req, tfm);
-   struct scatterlist sg;
uint8_t tmp[16], data[16];
int err;
 
@@ -378,7 +376,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, 
u8 *r)
/* The most significant octet of key corresponds to k[0] */
swap_buf(k, tmp, 16);
 
-   err = crypto_skcipher_setkey(tfm, tmp, 16);
+   err = crypto_cipher_setkey(tfm, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
@@ -387,16 +385,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, 
u8 *r)
/* Most significant octet of plaintextData corresponds to data[0] */
swap_buf(r, data, 16);
 
-   sg_init_one(, data, 16);
-
-   skcipher_request_set_tfm(req, tfm);
-   skcipher_request_set_callback(req, 0, NULL, NULL);
-   skcipher_request_set_crypt(req, , , 16, NULL);
-
-   err = crypto_skcipher_encrypt(req);
-   skcipher_request_zero(req);
-   if (err)
-   BT_ERR("Encrypt data error %d", err);
+   crypto_cipher_encrypt_one(tfm, data, data);
 
/* Most significant octet of encryptedData corresponds to data[0] */
swap_buf(data, r, 16);
@@ -406,7 +395,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, 
u8 *r)
return err;
 }
 
-static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16],
+static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
  const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
  const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
 {
@@ -455,7 +444,7 @@ static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 
k[16],
return err;
 }
 
-static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16],
+static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
  const u8 r1[16], const u8 r2[16], u8 _r[16])
 {
int err;
@@ -471,7 +460,7 @@ static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 
k[16],
return err;
 }
 
-static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16],
+static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
  const u8 r[3], u8 res[3])
 {
u8 _res[16];
@@ -759,7 +748,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
kzfree(smp->slave_csrk);
kzfree(smp->link_key);
 
-   crypto_free_skcipher(smp->tfm_aes);
+   crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
 
/* Ensure that we don't leave any debug key around if debug key
@@ -1359,9 +1348,9 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn 
*conn)
if (!smp)
return NULL;
 
-   smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+   smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(smp->tfm_aes)) {
-   BT_ERR("Unable to create ECB crypto context");
+   BT_ERR("Unable to create AES crypto context");
kzfree(smp);
return NULL;
}
@@ -1369,7 +1358,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn 
*conn)
smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if 

Re: esp: Fix ESN generation under UDP encapsulation

2016-06-23 Thread Herbert Xu
On Thu, Jun 23, 2016 at 11:52:45AM -0400, David Miller wrote:
> 
> Does the ipv6 side need the same fix?

Last I checked IPv6 didn't do IPsec UDP-encapsulation so we're
safe for now.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH iproute2 0/6] Add support for vrf keyword

2016-06-23 Thread David Ahern

On 6/14/16 2:59 PM, David Ahern wrote:

Currently the syntax for VRF related commands is rather kludgy and
inconsistent from one subcommand to another. This set adds support
for the VRF keyword to the link, address, neigh, and route commands
to improve the user experience listing data associated with vrfs,
modifying routes or doing a route lookup.

David Ahern (6):
  ip vrf: Add name_is_vrf
  ip link/addr: Add support for vrf keyword
  ip neigh: Add support for keyword
  ip route: Change type mask to bitmask
  ip vrf: Add ipvrf_get_table
  ip route: Add support for vrf keyword

 ip/ip_common.h  |   3 ++
 ip/ipaddress.c  |  11 ++
 ip/iplink.c |  15 ++-
 ip/iplink_vrf.c | 119 
 ip/ipneigh.c|  14 ++-
 ip/iproute.c|  43 
 6 files changed, 195 insertions(+), 10 deletions(-)



Stephen: This patchset is marked 'Changes Requested' in patchworks, yet 
I have not seen any responses to them. What change was requested?


David


[PATCH net-next] net: diag: Add support to filter on device index

2016-06-23 Thread David Ahern
Add support to inet_diag facility to filter sockets based on device
index. If an interface index is in the filter only sockets bound
to that index (sk_bound_dev_if) are returned.

Signed-off-by: David Ahern 
---
 include/uapi/linux/inet_diag.h |  1 +
 net/ipv4/inet_diag.c   | 25 +
 2 files changed, 26 insertions(+)

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index a16643705669..abbd1dc5d683 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -72,6 +72,7 @@ enum {
INET_DIAG_BC_AUTO,
INET_DIAG_BC_S_COND,
INET_DIAG_BC_D_COND,
+   INET_DIAG_BC_DEV_COND,   /* u32 ifindex */
 };
 
 struct inet_diag_hostcond {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 25af1243649b..38c2c47fe0e8 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -44,6 +44,7 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+   u32 ifindex;
 };
 
 static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+   case INET_DIAG_BC_DEV_COND: {
+   u32 ifindex;
+
+   ifindex = *((const u32 *)(op + 1));
+   if (ifindex != entry->ifindex)
+   yes = 0;
+   break;
+   }
}
 
if (yes) {
@@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock 
*sk)
entry_fill_addrs(, sk);
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
+   entry.ifindex = sk->sk_bound_dev_if;
entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
 
return inet_diag_bc_run(bc, );
@@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
 }
 
+/* data is u32 ifindex */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+   /* Check ifindex space. */
+   *min_len += sizeof(u32);
+   if (len < *min_len)
+   return false;
+
+   return true;
+}
 /* Validate an inet_diag_hostcond. */
 static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
   int *min_len)
@@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int 
bytecode_len)
if (!valid_hostcond(bc, len, _len))
return -EINVAL;
break;
+   case INET_DIAG_BC_DEV_COND:
+   if (!valid_devcond(bc, len, _len))
+   return -EINVAL;
+   break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
-- 
2.1.4



linux-next: manual merge of the net-next tree with the net tree

2016-06-23 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  drivers/net/vrf.c

between commit:

  52fe705b493d ("net: vrf: replace hard tab with space in assignment")

from the net tree and commits:

  671cd19ade97 ("net: vrf: ipv4 support for local traffic to local addresses")
  625b47b50732 ("net: vrf: ipv6 support for local traffic to local addresses")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/vrf.c
index 8bd8c7e1ee87,b3762822b653..
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@@ -304,8 -437,26 +437,26 @@@ static int vrf_rt6_create(struct net_de
dst_hold(>dst);
  
rt6->rt6i_table = rt6i_table;
 -  rt6->dst.output = vrf_output6;
 +  rt6->dst.output = vrf_output6;
+ 
+   /* create a dst for local routing - packets sent locally
+* to local address via the VRF device as a loopback
+*/
+   rt6_local = ip6_dst_alloc(net, dev, flags);
+   if (!rt6_local) {
+   dst_release(>dst);
+   goto out;
+   }
+ 
+   dst_hold(_local->dst);
+ 
+   rt6_local->rt6i_idev  = in6_dev_get(dev);
+   rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL;
+   rt6_local->rt6i_table = rt6i_table;
+   rt6_local->dst.input  = ip6_input;
+ 
rcu_assign_pointer(vrf->rt6, rt6);
+   rcu_assign_pointer(vrf->rt6_local, rt6_local);
  
rc = 0;
  out:
@@@ -403,10 -576,22 +576,22 @@@ static int vrf_rtable_create(struct net
if (!rth)
return -ENOMEM;
  
+   /* create a dst for local ingress routing - packets sent locally
+* to local address via the VRF device as a loopback
+*/
+   rth_local = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL, 1, 1, 0);
+   if (!rth_local) {
+   dst_release(>dst);
+   return -ENOMEM;
+   }
+ 
 -  rth->dst.output = vrf_output;
 +  rth->dst.output = vrf_output;
rth->rt_table_id = vrf->tb_id;
  
+   rth_local->rt_table_id = vrf->tb_id;
+ 
rcu_assign_pointer(vrf->rth, rth);
+   rcu_assign_pointer(vrf->rth_local, rth_local);
  
return 0;
  }


[PATCH v3] net/mlx5: use mlx5_buf_alloc_node instead of mlx5_buf_alloc in mlx5_wq_ll_create

2016-06-23 Thread Wang Sheng-Hui
Commit 311c7c71c9bb ("net/mlx5e: Allocate DMA coherent memory on
reader NUMA node") introduced mlx5_*_alloc_node() but missed changing
some call sites and warning messages. This patch introduces 2 changes:
* Use mlx5_buf_alloc_node() instead of mlx5_buf_alloc() in
  mlx5_wq_ll_create()
* Update the failure warn messages with _node postfix for
  mlx5_*_alloc function names

Fixes: 311c7c71c9bb ("net/mlx5e: Allocate DMA coherent memory on reader NUMA 
node")
Signed-off-by: Wang Sheng-Hui 
---

Changes since v2:
* Reformat the commit log

Changes since v1:
* Add Fixes line in commit log

 drivers/net/ethernet/mellanox/mlx5/core/wq.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index ce21ee5..821a087 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -75,14 +75,14 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct 
mlx5_wq_param *param,
 
err = mlx5_db_alloc_node(mdev, _ctrl->db, param->db_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
return err;
}
 
err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
  _ctrl->buf, param->buf_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
goto err_db_free;
}
 
@@ -111,14 +111,14 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct 
mlx5_wq_param *param,
 
err = mlx5_db_alloc_node(mdev, _ctrl->db, param->db_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
return err;
}
 
err = mlx5_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
  _ctrl->buf, param->buf_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
goto err_db_free;
}
 
@@ -148,13 +148,14 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct 
mlx5_wq_param *param,
 
err = mlx5_db_alloc_node(mdev, _ctrl->db, param->db_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
return err;
}
 
-   err = mlx5_buf_alloc(mdev, mlx5_wq_ll_get_byte_size(wq), _ctrl->buf);
+   err = mlx5_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq),
+ _ctrl->buf, param->buf_numa_node);
if (err) {
-   mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
+   mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
goto err_db_free;
}
 
-- 
2.7.4



[PATCH usbnet v2.1] mtu change needs to stop RX

2016-06-23 Thread Soohoon Lee

When the MTU is changed, unlink_urbs() flushes the RX queue, but meanwhile usbnet_bh() can
fill up the queue at the same time.
Depending on which HCD is down there, unlink can take a long time, and then the flush never
ends.

Signed-off-by: Soohoon Lee 
Reviewed-by: Kimball Murray 

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 61ba464..ce72dd0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -395,8 +395,11 @@ int usbnet_change_mtu (struct net_device *net, int new_mtu)
dev->hard_mtu = net->mtu + net->hard_header_len;
if (dev->rx_urb_size == old_hard_mtu) {
dev->rx_urb_size = dev->hard_mtu;
-   if (dev->rx_urb_size > old_rx_urb_size)
+   if (dev->rx_urb_size > old_rx_urb_size) {
+   usbnet_pause_rx(dev);
usbnet_unlink_rx_urbs(dev);
+   usbnet_resume_rx(dev);
+   }
}

/* max qlen depend on hard_mtu and rx_urb_size */
@@ -1509,6 +1512,7 @@ static void usbnet_bh (unsigned long param)
   netif_device_present (dev->net) &&
   netif_carrier_ok(dev->net) &&
   !timer_pending (>delay) &&
+  !test_bit (EVENT_RX_PAUSED, >flags) &&
   !test_bit (EVENT_RX_HALT, >flags)) {
int temp = dev->rxq.qlen;



Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-23 Thread Alexander Duyck
On Thu, Jun 23, 2016 at 2:06 PM, Yuval Mintz  wrote:
 Then again, if you're basically saying every HW-assisted offload on
 receive should be done under LRO flag, what would be the use case
 where a GRO-assisted offload would help?
 I.e., afaik LRO is superior to GRO in `brute force' -
 it creates better packed packets and utilizes memory better
 [with all the obvious cons such as inability for defragmentation].
 So if you'd have the choice of having an adapter perform 'classic'
 LRO aggregation or something that resembles a GRO packet,
 what would be the gain from doing the latter?
>
>> LRO and GRO shouldn't really differ in packing or anything like that.
>> The big difference between the two is that LRO is destructive while
>> GRO is not.  Specifically in the case of GRO you should be able to
>> take the resultant frame, feed it through GSO, and get the original
>> stream of frames back out.  So you can pack the frames however you
>> want the only key is that you must capture all the correct offsets and
>> set the gso_size correct for the flow.
>
> While the implementation might lack in things [such as issues with
> future implementation], following your logic it is GRO - I.e., forwarding
> scenarios work fine with HW assisted GRO.
>
>>> Just to relate to bnx2x/qede differences in current implementation -
>>> when this GRO hw-offload was added to bnx2x, it has already
>>> supported classical LRO, and due to above statement whenever LRO
>>> was set driver aggregated incoming traffic as classic LRO.
>>> I agree that in hindsight the lack of distinction between sw/hw GRO
>>> was hurting us.
>
>> In the case of bnx2x it sounds like you have issues that are
>> significantly hurting the performance versus classic software GRO.  If
>> that is the case you might want to simply flip the logic for the
>> module parameter that Rick mentioned and just disable the hardware
>> assisted GRO unless it is specifically requested.
>
> A bit hard to flip; The module parameter also disables LRO support.
> And given that module parameters is mostly a thing of the past, I
> don't think we should strive fixing things through additional changes
> in that area.
>
>> > qede isn't implementing LRO, so we could easily mark this feature
>> > under LRO there - but question is, given that the adapter can support
>> > LRO, if we're going to suffer from all the shotrages that arise from
>> > putting this feature under LRO, why should we bother?
>
>> The idea is to address feature isolation.  The fact is the hardware
>> exists outside of kernel control.  If you end up linking an internal
>> kernel feature to your device like this you are essentially stripping
>> the option of using the kernel feature.
>
>> I would prefer to see us extend LRO to support "close enough GRO"
>> instead of have us extend GRO to also include LRO.
>
> Again - why? What's the benefit of HW doing LRO and trying to
> imitate GRO, if it's still carrying all the LRO baggage
> [specifically, disabling it on forwarding] as opposed to simply
> doing classic LRO?

In most cases that is all LRO is.  LRO is trying to do the best it can
to emulate GRO.  The reason why it was pushed off into a separate
feature bit is that it never quite works out that way and most vendors
end up with something that comes close but is always missing a few
items.

The "hardware assisted GRO" is really nothing more than a marketing
term and a justification to ignore the fact that we should probably be
disabling it when routing or bridging is enabled.  What you are doing
is LRO but using the wrong bit to test for the feature.  We already
know of one firmware bug you guys have which makes it clear that the
bnx2x is not doing hardware assisted GRO it is doing something else
since it performs much worse than GRO if the MSS is less than what it
would be based on the MTU.

>> > You can argue that we might need a new feature bit for control
>> > over such a feature; If we don't do that, is there any gain in all of this?
>
>> I would argue that yes there are many cases where we will be able to
>> show gain.  The fact is there is a strong likelihood of the GRO on
>> your parts having some differences either now, or at some point in the
>> future as the code evolves.  As I mentioned there was already some
>> talk about possibly needing to push the UDP tunnel aggregation out of
>> GRO and perhaps handling it sometime after IP look up had verified
>> that the destination was in fact a local address in the namespace.  In
>> addition it makes the changes to include the tunnel encapsulation much
>> more acceptable as LRO is already naturally dropped in the routing and
>> bridging cases if I recall correctly.
>
> I think it all boils down to the question of "do we actually want to have
> HW-assisted GRO?". If we do [and not necessarily for the UDP-tunnel
> scenario] then we need to have it distinct from LRO, otherwise there's
> very little 

Re: Doing crypto in small stack buffers (bluetooth vs vmalloc-stack crash, etc)

2016-06-23 Thread Andy Lutomirski
On Wed, Jun 22, 2016 at 11:41 PM, Herbert Xu
 wrote:
> On Thu, Jun 23, 2016 at 11:48:25AM +0800, Herbert Xu wrote:
>>
>> No we never had such an API in the kernel.  However, I see that
>> rxkad does some pretty silly things and we should be able to avoid
>> using the stack in pretty much all cases.  Let me try to come up with
>> something.
>
> Here it is:
>
> ---8<---
> Subject: rxrpc: Avoid using stack memory in SG lists in rxkad

Looks reasonable to me.  Unless anyone tells me otherwise, my plan is
to queue it in my virtually-mapped stack series and to ask Ingo to
apply it via -tip.

If it went in via the networking tree, that would work as well, but it
would introduce a bisectability problem.

Thanks!

--Andy


Re: [PATCH net-next v2 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-23 Thread Martin KaFai Lau
On Thu, Jun 23, 2016 at 11:50:08PM +0200, Daniel Borkmann wrote:
> On 06/23/2016 11:26 PM, Martin KaFai Lau wrote:
> >We are still hatching out how to set this up in production. However, the
> >situation is similar to removing the pinned file.
s/pinned file/pinned cgroup-array/

> I presume you mean removing the last BPF program holding a reference on
> the cgroup array map.
Yes

> (Any user space visibility like struct files given
> from the anon inode and pinnings are tracked via uref, btw, which is
> needed to break possible complex dependencies among tail called programs.)
Yep. Understood on prog_array use case.

Thanks,
-- Martin


Re: [PATCH net-next v2 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-23 Thread Daniel Borkmann

On 06/23/2016 11:26 PM, Martin KaFai Lau wrote:

On Thu, Jun 23, 2016 at 11:42:31AM +0200, Daniel Borkmann wrote:

Hi Martin,

[ sorry to jump late in here, on pto currently ]

Thanks for reviewing.


Could you describe a bit more with regards to pinning maps and how this
should interact with cgroups? The two specialized array maps we have (tail
calls, perf events) have fairly complicated semantics for when to clean up
map slots (see commits c9da161c6517ba1, 3b1efb196eee45b2f0c4).

How is this managed with cgroups? Once a cgroup fd is placed into a map and
the user removes the cgroup, will this be prevented due to 'being busy', or
will the cgroup live further as long as a program is running with a cgroup
map entry (but the cgroup itself is not visible from user space in any way
anymore)?

Having a cgroup ptr stored in the bpf_map will not stop the user from
removing the cgroup (by rmdir /mnt/cgroup2/tc/test_cgrp).


Right.


The cgroup ptr stored in the bpf_map holds a refcnt which answer the
second part.


Yep, clear.


The situation is similar to the netfilter usecase in
commit 38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")


I presume it's a valid use case to pin a cgroup map, put fds into it and
remove the pinned file expecting to continue to match on it, right? So
lifetime is really until last prog using a cgroup map somewhere gets removed
(even if not accessible from user space anymore, meaning no prog has fd and
pinned file was removed).

Yes.

We are still hatching out how to set this up in production. However, the
situation is similar to removing the pinned file.


I presume you mean removing the last BPF program holding a reference on
the cgroup array map. (Any user space visibility like struct files given
from the anon inode and pinnings are tracked via uref, btw, which is
needed to break possible complex dependencies among tail called programs.)
But dropping cgroup ref at latest when the last map ref is dropped as you
currently do seems fine. It makes cgroup array maps effectively no different
from plain regular array maps.


We probably will not use tc and pin a bpf_map to do that.  Instead,
one process will setup eveything (e.g. create the cgroup, pouplate the
cgroup map, load the bpf to egress) and then go away.


Yep, that seems a valid case as well, both use cases (pinned and non-pinned)
should be fine with your code then.

Thanks,
Daniel


[PATCH 2/2] net: ethernet: dnet: use phy_ethtool_{get|set}_link_ksettings

2016-06-23 Thread Philippe Reynes
There are two generic functions phy_ethtool_{get|set}_link_ksettings,
so we can use them instead of defining the same code in the driver.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/dnet.c |   24 ++--
 1 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c
index 9c6955f..c3b64cd 100644
--- a/drivers/net/ethernet/dnet.c
+++ b/drivers/net/ethernet/dnet.c
@@ -730,26 +730,6 @@ static struct net_device_stats *dnet_get_stats(struct 
net_device *dev)
return nstat;
 }
 
-static int dnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-   struct phy_device *phydev = dev->phydev;
-
-   if (!phydev)
-   return -ENODEV;
-
-   return phy_ethtool_gset(phydev, cmd);
-}
-
-static int dnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-   struct phy_device *phydev = dev->phydev;
-
-   if (!phydev)
-   return -ENODEV;
-
-   return phy_ethtool_sset(phydev, cmd);
-}
-
 static int dnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
struct phy_device *phydev = dev->phydev;
@@ -772,11 +752,11 @@ static void dnet_get_drvinfo(struct net_device *dev,
 }
 
 static const struct ethtool_ops dnet_ethtool_ops = {
-   .get_settings   = dnet_get_settings,
-   .set_settings   = dnet_set_settings,
.get_drvinfo= dnet_get_drvinfo,
.get_link   = ethtool_op_get_link,
.get_ts_info= ethtool_op_get_ts_info,
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
+   .set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
 
 static const struct net_device_ops dnet_netdev_ops = {
-- 
1.7.4.4



[PATCH 1/2] net: ethernet: dnet: use phydev from struct net_device

2016-06-23 Thread Philippe Reynes
The private structure contains a pointer to phydev, but the structure
net_device already contains such a pointer. So we can remove the pointer
phydev in the private structure, and update the driver to use the
one contained in struct net_device.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/dnet.c |   28 
 drivers/net/ethernet/dnet.h |1 -
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c
index b69a9ea..9c6955f 100644
--- a/drivers/net/ethernet/dnet.c
+++ b/drivers/net/ethernet/dnet.c
@@ -173,7 +173,7 @@ static int dnet_mdio_write(struct mii_bus *bus, int mii_id, 
int regnum,
 static void dnet_handle_link_change(struct net_device *dev)
 {
struct dnet *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
unsigned long flags;
u32 mode_reg, ctl_reg;
 
@@ -295,7 +295,6 @@ static int dnet_mii_probe(struct net_device *dev)
bp->link = 0;
bp->speed = 0;
bp->duplex = -1;
-   bp->phy_dev = phydev;
 
return 0;
 }
@@ -629,16 +628,16 @@ static int dnet_open(struct net_device *dev)
struct dnet *bp = netdev_priv(dev);
 
/* if the phy is not yet register, retry later */
-   if (!bp->phy_dev)
+   if (!dev->phydev)
return -EAGAIN;
 
napi_enable(>napi);
dnet_init_hw(bp);
 
-   phy_start_aneg(bp->phy_dev);
+   phy_start_aneg(dev->phydev);
 
/* schedule a link state check */
-   phy_start(bp->phy_dev);
+   phy_start(dev->phydev);
 
netif_start_queue(dev);
 
@@ -652,8 +651,8 @@ static int dnet_close(struct net_device *dev)
netif_stop_queue(dev);
napi_disable(>napi);
 
-   if (bp->phy_dev)
-   phy_stop(bp->phy_dev);
+   if (dev->phydev)
+   phy_stop(dev->phydev);
 
dnet_reset_hw(bp);
netif_carrier_off(dev);
@@ -733,8 +732,7 @@ static struct net_device_stats *dnet_get_stats(struct 
net_device *dev)
 
 static int dnet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-   struct dnet *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!phydev)
return -ENODEV;
@@ -744,8 +742,7 @@ static int dnet_get_settings(struct net_device *dev, struct 
ethtool_cmd *cmd)
 
 static int dnet_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-   struct dnet *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!phydev)
return -ENODEV;
@@ -755,8 +752,7 @@ static int dnet_set_settings(struct net_device *dev, struct 
ethtool_cmd *cmd)
 
 static int dnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-   struct dnet *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!netif_running(dev))
return -EINVAL;
@@ -875,7 +871,7 @@ static int dnet_probe(struct platform_device *pdev)
   (bp->capabilities & DNET_HAS_IRQ) ? "" : "no ",
   (bp->capabilities & DNET_HAS_GIGABIT) ? "" : "no ",
   (bp->capabilities & DNET_HAS_DMA) ? "" : "no ");
-   phydev = bp->phy_dev;
+   phydev = dev->phydev;
phy_attached_info(phydev);
 
return 0;
@@ -899,8 +895,8 @@ static int dnet_remove(struct platform_device *pdev)
 
if (dev) {
bp = netdev_priv(dev);
-   if (bp->phy_dev)
-   phy_disconnect(bp->phy_dev);
+   if (dev->phydev)
+   phy_disconnect(dev->phydev);
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
unregister_netdev(dev);
diff --git a/drivers/net/ethernet/dnet.h b/drivers/net/ethernet/dnet.h
index 37f5b30..d985080 100644
--- a/drivers/net/ethernet/dnet.h
+++ b/drivers/net/ethernet/dnet.h
@@ -216,7 +216,6 @@ struct dnet {
 
/* PHY stuff */
struct mii_bus  *mii_bus;
-   struct phy_device   *phy_dev;
unsigned intlink;
unsigned intspeed;
unsigned intduplex;
-- 
1.7.4.4



Re: [PATCH net-next v2 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-23 Thread Martin KaFai Lau
On Thu, Jun 23, 2016 at 10:07:27PM +0200, Daniel Borkmann wrote:
> On 06/23/2016 06:54 PM, Martin KaFai Lau wrote:
> >On Thu, Jun 23, 2016 at 11:53:50AM +0200, Daniel Borkmann wrote:
> >>>diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>>index 668e079..68753e0 100644
> >>>--- a/kernel/bpf/verifier.c
> >>>+++ b/kernel/bpf/verifier.c
> >>>@@ -1062,6 +1062,10 @@ static int check_map_func_compatibility(struct 
> >>>bpf_map *map, int func_id)
> >>>   if (func_id != BPF_FUNC_get_stackid)
> >>>   goto error;
> >>>   break;
> >>>+  case BPF_MAP_TYPE_CGROUP_ARRAY:
> >>>+  if (func_id != BPF_FUNC_skb_in_cgroup)
> >>>+  goto error;
> >>>+  break;
> >>
> >>I think the BPF_MAP_TYPE_CGROUP_ARRAY case should have been fist here in
> >>patch 2/4, but with unconditional goto error. And this one only adds the
> >>'func_id != BPF_FUNC_skb_in_cgroup' test.
> >I am not sure I understand.  Can you elaborate? I am probably missing
> >something here.
>
> If someone backports patch 2/4 as-is, but for some reason not 3/4, then you
> could craft a program that calls f.e. bpf_map_update_elem() on a cgroup array
> and would thus cause a NULL pointer deref, since verifier doesn't prevent it.
> I'm just trying to say that it would probably make sense to add the above 
> 'case
> BPF_MAP_TYPE_CGROUP_ARRAY:' with an unconditional 'goto error' in patch 2/4
> and extend upon it in patch 3/4 so result looks like here, so that the patches
> are fine/complete each as stand-alone.
I failed to connect some points in your last comment.  Thanks for explaining.

Make sense. I will spin v3.


Re: [PATCH net-next v2 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-23 Thread Daniel Borkmann

On 06/23/2016 11:13 PM, Tejun Heo wrote:

Hello,

On Thu, Jun 23, 2016 at 11:42:31AM +0200, Daniel Borkmann wrote:

I presume it's a valid use case to pin a cgroup map, put fds into it and
remove the pinned file expecting to continue to match on it, right? So
lifetime is really until last prog using a cgroup map somewhere gets removed
(even if not accessible from user space anymore, meaning no prog has fd and
pinned file was removed).


Yeap, from what I can see, the cgroup will stay around (even if it
gets deleted) as long as the bpf rule using it is around and that's
completely fine from cgroup side.


Ok, thanks for confirming!


Thanks.


[PATCH usbnet v2] mtu change needs to stop RX

2016-06-23 Thread Soohoon Lee

When the MTU is changed, unlink_urbs() flushes the RX queue, but meanwhile usbnet_bh() can
refill the queue at the same time.
Depending on which HCD is underneath, unlink can take a long time, and then the flush
never ends.

Reviewed-by: kmur...@f5.com

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 61ba464..ce72dd0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -395,8 +395,11 @@ int usbnet_change_mtu (struct net_device *net, int new_mtu)
dev->hard_mtu = net->mtu + net->hard_header_len;
if (dev->rx_urb_size == old_hard_mtu) {
dev->rx_urb_size = dev->hard_mtu;
-   if (dev->rx_urb_size > old_rx_urb_size)
+   if (dev->rx_urb_size > old_rx_urb_size) {
+   usbnet_pause_rx(dev);
usbnet_unlink_rx_urbs(dev);
+   usbnet_resume_rx(dev);
+   }
}
 
/* max qlen depend on hard_mtu and rx_urb_size */
@@ -1509,6 +1512,7 @@ static void usbnet_bh (unsigned long param)
   netif_device_present (dev->net) &&
   netif_carrier_ok(dev->net) &&
   !timer_pending (&dev->delay) &&
+  !test_bit (EVENT_RX_PAUSED, &dev->flags) &&
   !test_bit (EVENT_RX_HALT, &dev->flags)) {
int temp = dev->rxq.qlen;



Re: [PATCH net-next v2 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-23 Thread Martin KaFai Lau
On Thu, Jun 23, 2016 at 11:42:31AM +0200, Daniel Borkmann wrote:
> Hi Martin,
>
> [ sorry to jump late in here, on pto currently ]
Thanks for reviewing.

> Could you describe a bit more with regards to pinning maps and how this
> should interact with cgroups? The two specialized array maps we have (tail
> calls, perf events) have fairly complicated semantics for when to clean up
> map slots (see commits c9da161c6517ba1, 3b1efb196eee45b2f0c4).
>
> How is this managed with cgroups? Once a cgroup fd is placed into a map and
> the user removes the cgroup, will this be prevented due to 'being busy', or
> will the cgroup live further as long as a program is running with a cgroup
> map entry (but the cgroup itself is not visible from user space in any way
> anymore)?
Having a cgroup ptr stored in the bpf_map will not stop the user from
removing the cgroup (by rmdir /mnt/cgroup2/tc/test_cgrp).

The cgroup ptr stored in the bpf_map holds a refcnt which answer the
second part.

The situation is similar to the netfilter usecase in
commit 38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")

>
> I presume it's a valid use case to pin a cgroup map, put fds into it and
> remove the pinned file expecting to continue to match on it, right? So
> lifetime is really until last prog using a cgroup map somewhere gets removed
> (even if not accessible from user space anymore, meaning no prog has fd and
> pinned file was removed).
Yes.

We are still hatching out how to set this up in production. However, the
situation is similar to removing the pinned file.
We probably will not use tc and pin a bpf_map to do that.  Instead,
one process will set up everything (e.g. create the cgroup, populate the
cgroup map, load the bpf to egress) and then go away.

I don't think we need a prog fd to remove the bpf prog.

>
> I assume that using struct file here doesn't make sense (commit 
> e03e7ee34fdd1c3)
> either, right?
No. I don't think so. We eventually need a cgroup from the 'struct file'.


[PATCH net 2/3] net: bgmac: Start transmit queue in bgmac_open

2016-06-23 Thread Florian Fainelli
The driver does not start the transmit queue in bgmac_open(). If the
queue was stopped prior to closing then re-opening the interface, we
would never be able to wake-up again.

Fixes: dd4544f05469 ("bgmac: driver for GBit MAC core on BCMA bus")
Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 70926c611f25..85cd07f72ffb 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1314,6 +1314,9 @@ static int bgmac_open(struct net_device *net_dev)
phy_start(bgmac->phy_dev);
 
netif_carrier_on(net_dev);
+
+   netif_start_queue(net_dev);
+
return 0;
 }
 
-- 
2.7.4



[PATCH net 3/3] net: bgmac: Remove superfluous netif_carrier_on()

2016-06-23 Thread Florian Fainelli
bgmac_open() calls phy_start() to initialize the PHY state machine,
which will set the interface's carrier state accordingly, no need to
force that as this could be conflicting with the PHY state determined by
PHYLIB.

Fixes: dd4544f05469 ("bgmac: driver for GBit MAC core on BCMA bus")
Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 85cd07f72ffb..a6333d38ecc0 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1313,8 +1313,6 @@ static int bgmac_open(struct net_device *net_dev)
 
phy_start(bgmac->phy_dev);
 
-   netif_carrier_on(net_dev);
-
netif_start_queue(net_dev);
 
return 0;
-- 
2.7.4



[PATCH net 0/3] net: bgmac: Random fixes

2016-06-23 Thread Florian Fainelli
Hi all,

This patch series fixes a few issues spotted by code inspection and
actual testing.

Thanks

Florian Fainelli (3):
  net: bgmac: Fix SOF bit checking
  net: bgmac: Start transmit queue in bgmac_open
   net: bgmac: Remove superfluous netif_carrier_on()

 drivers/net/ethernet/broadcom/bgmac.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

-- 
2.7.4



[PATCH net 1/3] net: bgmac: Fix SOF bit checking

2016-06-23 Thread Florian Fainelli
We are checking for the Start of Frame bit in the ctl1 word, while this
bit is set in the ctl0 word instead. Read the ctl0 word and update the
check to verify that.

Fixes: 9cde94506eac ("bgmac: implement scatter/gather support")
Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index ee5f431ab32a..70926c611f25 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -267,15 +267,16 @@ static void bgmac_dma_tx_free(struct bgmac *bgmac, struct 
bgmac_dma_ring *ring)
while (ring->start != ring->end) {
int slot_idx = ring->start % BGMAC_TX_RING_SLOTS;
struct bgmac_slot_info *slot = &ring->slots[slot_idx];
-   u32 ctl1;
+   u32 ctl0, ctl1;
int len;
 
if (slot_idx == empty_slot)
break;
 
+   ctl0 = le32_to_cpu(ring->cpu_base[slot_idx].ctl0);
ctl1 = le32_to_cpu(ring->cpu_base[slot_idx].ctl1);
len = ctl1 & BGMAC_DESC_CTL1_LEN;
-   if (ctl1 & BGMAC_DESC_CTL0_SOF)
+   if (ctl0 & BGMAC_DESC_CTL0_SOF)
/* Unmap no longer used buffer */
dma_unmap_single(dma_dev, slot->dma_addr, len,
 DMA_TO_DEVICE);
-- 
2.7.4



[PATCH net] bonding: fix 802.3ad aggregator reselection

2016-06-23 Thread Jay Vosburgh

Since commit 7bb11dc9f59d ("bonding: unify all places where
actor-oper key needs to be updated."), the logic in bonding to handle
selection between multiple aggregators has not functioned.

This affects only configurations wherein the bonding slaves
connect to two discrete aggregators (e.g., two independent switches, each
with LACP enabled), thus creating two separate aggregation groups within a
single bond.

The cause is a change in 7bb11dc9f59d to no longer set
AD_PORT_BEGIN on a port after a link state change, which would cause the
port to be reselected for attachment to an aggregator as if were newly
added to the bond.  We cannot restore the prior behavior, as it
contradicts IEEE 802.1AX 5.4.12, which requires ports that "become
inoperable" (lose carrier, setting port_enabled=false as per 802.1AX
5.4.7) to remain selected (i.e., assigned to the aggregator).  As the port
now remains selected, the aggregator selection logic is not invoked.

A side effect of this change is that aggregators in bonding will
now contain ports that are link down.  The aggregator selection logic
does not currently handle this situation correctly, causing incorrect
aggregator selection.

This patch makes two changes to repair the aggregator selection
logic in bonding to function as documented and within the confines of the
standard:

First, the aggregator selection and related logic now utilizes the
number of active ports per aggregator, not the number of selected ports
(as some selected ports may be down).  The ad_select "bandwidth" and
"count" options only consider ports that are link up.

Second, on any carrier state change of any slave, the aggregator
selection logic is explicitly called to insure the correct aggregator is
active.

Reported-by: Veli-Matti Lintu 
Fixes: 7bb11dc9f59d ("bonding: unify all places where actor-oper key needs to 
be updated.")
Signed-off-by: Jay Vosburgh 

---
 drivers/net/bonding/bond_3ad.c | 64 +-
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index b9304a295f86..ca81f46ea1aa 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -657,6 +657,20 @@ static void __set_agg_ports_ready(struct aggregator 
*aggregator, int val)
}
 }
 
+static int __agg_active_ports(struct aggregator *agg)
+{
+   struct port *port;
+   int active = 0;
+
+   for (port = agg->lag_ports; port;
+port = port->next_port_in_aggregator) {
+   if (port->is_enabled)
+   active++;
+   }
+
+   return active;
+}
+
 /**
  * __get_agg_bandwidth - get the total bandwidth of an aggregator
  * @aggregator: the aggregator we're looking at
@@ -664,39 +678,40 @@ static void __set_agg_ports_ready(struct aggregator 
*aggregator, int val)
  */
 static u32 __get_agg_bandwidth(struct aggregator *aggregator)
 {
+   int nports = __agg_active_ports(aggregator);
u32 bandwidth = 0;
 
-   if (aggregator->num_of_ports) {
+   if (nports) {
switch (__get_link_speed(aggregator->lag_ports)) {
case AD_LINK_SPEED_1MBPS:
-   bandwidth = aggregator->num_of_ports;
+   bandwidth = nports;
break;
case AD_LINK_SPEED_10MBPS:
-   bandwidth = aggregator->num_of_ports * 10;
+   bandwidth = nports * 10;
break;
case AD_LINK_SPEED_100MBPS:
-   bandwidth = aggregator->num_of_ports * 100;
+   bandwidth = nports * 100;
break;
case AD_LINK_SPEED_1000MBPS:
-   bandwidth = aggregator->num_of_ports * 1000;
+   bandwidth = nports * 1000;
break;
case AD_LINK_SPEED_2500MBPS:
-   bandwidth = aggregator->num_of_ports * 2500;
+   bandwidth = nports * 2500;
break;
case AD_LINK_SPEED_10000MBPS:
-   bandwidth = aggregator->num_of_ports * 10000;
+   bandwidth = nports * 10000;
break;
case AD_LINK_SPEED_2MBPS:
-   bandwidth = aggregator->num_of_ports * 2;
+   bandwidth = nports * 2;
break;
case AD_LINK_SPEED_4MBPS:
-   bandwidth = aggregator->num_of_ports * 4;
+   bandwidth = nports * 4;
break;
case AD_LINK_SPEED_56000MBPS:
-   bandwidth = aggregator->num_of_ports * 56000;
+   bandwidth = nports * 56000;

Re: [PATCH net-next v2 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-23 Thread Tejun Heo
Hello,

On Thu, Jun 23, 2016 at 11:42:31AM +0200, Daniel Borkmann wrote:
> I presume it's a valid use case to pin a cgroup map, put fds into it and
> remove the pinned file expecting to continue to match on it, right? So
> lifetime is really until last prog using a cgroup map somewhere gets removed
> (even if not accessible from user space anymore, meaning no prog has fd and
> pinned file was removed).

Yeap, from what I can see, the cgroup will stay around (even if it
gets deleted) as long as the bpf rule using it is around and that's
completely fine from cgroup side.

Thanks.

-- 
tejun


Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-23 Thread Yuval Mintz
>>> Then again, if you're basically saying every HW-assisted offload on
>>> receive should be done under LRO flag, what would be the use case
>>> where a GRO-assisted offload would help?
>>> I.e., afaik LRO is superior to GRO in `brute force' -
>>> it creates better packed packets and utilizes memory better
>>> [with all the obvious cons such as inability for defragmentation].
>>> So if you'd have the choice of having an adapter perform 'classic'
>>> LRO aggregation or something that resembles a GRO packet,
>>> what would be the gain from doing the latter?

> LRO and GRO shouldn't really differ in packing or anything like that.
> The big difference between the two is that LRO is destructive while
> GRO is not.  Specifically in the case of GRO you should be able to
> take the resultant frame, feed it through GSO, and get the original
> stream of frames back out.  So you can pack the frames however you
> want the only key is that you must capture all the correct offsets and
> set the gso_size correct for the flow.

While the implementation might lack in things [such as issues with
future implementation], following your logic it is GRO - I.e., forwarding
scenarios work fine with HW assisted GRO.

>> Just to relate to bnx2x/qede differences in current implementation -
>> when this GRO hw-offload was added to bnx2x, it has already
>> supported classical LRO, and due to above statement whenever LRO
>> was set driver aggregated incoming traffic as classic LRO.
>> I agree that in hindsight the lack of distinction between sw/hw GRO
>> was hurting us.

> In the case of bnx2x it sounds like you have issues that are
> significantly hurting the performance versus classic software GRO.  If
> that is the case you might want to simply flip the logic for the
> module parameter that Rick mentioned and just disable the hardware
> assisted GRO unless it is specifically requested.

A bit hard to flip; The module parameter also disables LRO support.
And given that module parameters is mostly a thing of the past, I
don't think we should strive fixing things through additional changes
in that area.

> > qede isn't implementing LRO, so we could easily mark this feature
> > under LRO there - but question is, given that the adapter can support
> > LRO, if we're going to suffer from all the shortages that arise from
> > putting this feature under LRO, why should we bother?

> The idea is to address feature isolation.  The fact is the hardware
> exists outside of kernel control.  If you end up linking an internal
> kernel feature to your device like this you are essentially stripping
> the option of using the kernel feature.

> I would prefer to see us extend LRO to support "close enough GRO"
> instead of have us extend GRO to also include LRO. 

Again - why? What's the benefit of HW doing LRO and trying to
control imitate GRO, if it's still carrying all the LRO baggage
[specifically, disabling it on forwarding] as opposed to simply
doing classic LRO?

> > You can argue that we might need a new feature bit for control
> > over such a feature; If we don't do that, is there any gain in all of this?

> I would argue that yes there are many cases where we will be able to
> show gain.  The fact is there is a strong likelihood of the GRO on
> your parts having some differences either now, or at some point in the
> future as the code evolves.  As I mentioned there was already some
> talk about possibly needing to push the UDP tunnel aggregation out of
> GRO and perhaps handling it sometime after IP look up had verified
> that the destination was in fact a local address in the namespace.  In
> addition it makes the changes to include the tunnel encapsulation much
> more acceptable as LRO is already naturally dropped in the routing and
> bridging cases if I recall correctly.

I think it all boils down to the question of "do we actually want to have
HW-assisted GRO?". If we do [and not necessarily for the UDP-tunnel
scenario] then we need to have it distinct from LRO, otherwise there's
very little gain. If we believe tGRO should remain SW-only, then
I think the discussion is moot; We need to stop trying this, and offload
only LRO - in which case we can aggregate it in whichever 'destructive'
[correct] format we like, without trying to have it resemble GRO.
 


Re: [PATCH net-next v2 1/4] cgroup: Add cgroup_get_from_fd

2016-06-23 Thread Tejun Heo
On Wed, Jun 22, 2016 at 02:17:29PM -0700, Martin KaFai Lau wrote:
> Add a helper function to get a cgroup2 from a fd.  It will be
> stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will
> be introduced in the later patch.
> 
> Signed-off-by: Martin KaFai Lau 
> Cc: Alexei Starovoitov 
> Cc: Daniel Borkmann 
> Cc: Tejun Heo 

 Acked-by: Tejun Heo 

Please feel free to route this patch with the rest of the series.  If
it's preferable to apply this to the cgroup branch, please let me
know.

Thanks!

-- 
tejun


Re: [PATCH net v2] ipv6: enforce egress device match in per table nexthop lookups

2016-06-23 Thread David Ahern

On 6/23/16 8:39 AM, Paolo Abeni wrote:

On Thu, 2016-06-23 at 08:29 -0600, David Ahern wrote:

On 6/23/16 8:20 AM, David Ahern wrote:

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913d..520b788 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info
*ip6_nh_lookup_table(struct net *net,
 };
 struct fib6_table *table;
 struct rt6_info *rt;
-int flags = 0;
+int flags = RT6_LOOKUP_F_IFACE;

 table = fib6_get_table(net, cfg->fc_table);
 if (!table)



Acked-by: David Ahern 


I take that back.

I think RT6_LOOKUP_F_IFACE should only be set if cfg->fc_ifindex is set.


AFAICS the latter condition should not be needed. The related
information is passed all way down to rt6_score_route(), where it's
really used:

m = rt6_check_dev(rt, oif);
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;

and 'm' can be 0 only if oif is set: RT6_LOOKUP_F_IFACE has no effect
ifindex is set.



For the simplified lookup yes that is true. Lookups that go through 
ip6_pol_route it is not and for my comment above I was thinking about 
this latter case.


Anyways, your change is fine for the ip6_nh_lookup_table case.


[PATCH] ipmr/ip6mr: Initialize the last assert time of mfc entries.

2016-06-23 Thread Tom Goff
This fixes wrong-interface signaling on 32-bit platforms for entries
created when jiffies > 2^31 + MFC_ASSERT_THRESH.

Signed-off-by: Tom Goff 
---
 net/ipv4/ipmr.c  | 4 +++-
 net/ipv6/ip6mr.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 21a38e2..5ad48ec 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -891,8 +891,10 @@ static struct mfc_cache *ipmr_cache_alloc(void)
 {
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 
-   if (c)
+   if (c) {
+   c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXVIFS;
+   }
return c;
 }
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index f2e2013f8..487ef3b 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1074,6 +1074,7 @@ static struct mfc6_cache *ip6mr_cache_alloc(void)
struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
if (!c)
return NULL;
+   c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXMIFS;
return c;
 }
-- 
1.9.1



Re: [PATCH net-next v2 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-23 Thread Daniel Borkmann

On 06/23/2016 06:54 PM, Martin KaFai Lau wrote:

On Thu, Jun 23, 2016 at 11:53:50AM +0200, Daniel Borkmann wrote:

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e079..68753e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1062,6 +1062,10 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+   case BPF_MAP_TYPE_CGROUP_ARRAY:
+   if (func_id != BPF_FUNC_skb_in_cgroup)
+   goto error;
+   break;


I think the BPF_MAP_TYPE_CGROUP_ARRAY case should have been fist here in
patch 2/4, but with unconditional goto error. And this one only adds the
'func_id != BPF_FUNC_skb_in_cgroup' test.

I am not sure I understand.  Can you elaborate? I am probably missing
something here.


If someone backports patch 2/4 as-is, but for some reason not 3/4, then you
could craft a program that calls f.e. bpf_map_update_elem() on a cgroup array
and would thus cause a NULL pointer deref, since verifier doesn't prevent it.
I'm just trying to say that it would probably make sense to add the above 'case
BPF_MAP_TYPE_CGROUP_ARRAY:' with an unconditional 'goto error' in patch 2/4
and extend upon it in patch 3/4 so result looks like here, so that the patches
are fine/complete each as stand-alone.


Re: [PATCH net-next 0/2] qed*: coalesce parameters config support.

2016-06-23 Thread David Miller
From: Sudarsana Reddy Kalluru 
Date: Tue, 21 Jun 2016 09:36:20 -0400

> The patch series adds the support for config/read of the adapter coalesce
> parameters. Patch (1) adds the qed infrastructure/APIs for the support and
> patch (2) adds the driver support for following ethtool commands:
>   ethtool -c|--show-coalesce ethX
>   ethtool -C|--coalesce ethX [rx-usecs N] [tx-usecs N]

Series applied.


Re: pull-request: wireless-drivers-next 2016-06-21

2016-06-23 Thread David Miller
From: Kalle Valo 
Date: Tue, 21 Jun 2016 13:47:45 +0300

> I hope it's ok to send two pull requests the same day, both for net
> and net-next? This is targeted to 4.8 so it is for net-next.

Yeah that's fine.

> Even though is this the first pull request for 4.8 we actually
> remove more code than add, thanks to Guenter Roeck's on removing
> unused "phy_a" support from b43. Otherwise there's not really
> anything standing out, adding new chipset support to brcmfmac and
> ath10k, lots of fixes and the usual.

Pulled, thanks.


Re: pull-request: wireless-drivers 2016-06-21

2016-06-23 Thread David Miller
From: Kalle Valo 
Date: Tue, 21 Jun 2016 13:27:16 +0300

> here is a pull request for 4.7, really small fixes this time, some of
> them fix important regressions. Please let me know if there are any
> problems.

Applied, thanks.


Re: [PATCH] net: ethernet: fix odd_ptr_err.cocci warnings

2016-06-23 Thread David Miller
From: Julia Lawall 
Date: Tue, 21 Jun 2016 12:03:28 +0200 (CEST)

> PTR_ERR should normally access the value just tested by IS_ERR
> 
> Generated by: scripts/coccinelle/tests/odd_ptr_err.cocci
> 
> CC: Tien Hock Loh 
> Signed-off-by: Julia Lawall 
> Signed-off-by: Fengguang Wu 

This doesn't apply to any of my tree(s).


Re: [PATCH v4 00/19] CALIPSO Implementation

2016-06-23 Thread David Miller
From: Huw Davies 
Date: Tue, 21 Jun 2016 10:55:48 +0100

> On Tue, Jun 21, 2016 at 05:39:28AM -0400, David Miller wrote:
>> From: Huw Davies 
>> Date: Mon, 20 Jun 2016 14:36:40 +0100
>> 
>> > This patch series implements RFC 5570 - Common Architecture Label IPv6
>> > Security Option (CALIPSO).  Its goal is to set MLS sensitivity labels
>> > on IPv6 packets using a hop-by-hop option.  CALIPSO is very similar to
>> > its IPv4 cousin CIPSO and much of this series is based on that code.
>> 
>> What tree do you expect to integrate this?
> 
> My understanding is that Paul Moore is happy to take them
> in via the SELinux tree.  However, these patches do touch
> some core networking code, such as the IPv6 option handling
> code (in a similar manner to the way CIPSO touched the IPv4
> option code), so if you have any comments on those aspects
> that would be good to hear.

No objections on my part.


Re: [PATCH] net: vrf: replace hard tab with space in assignment

2016-06-23 Thread David Miller
From: Chris Packham 
Date: Tue, 21 Jun 2016 15:39:43 +1200

> The assignment of rth->dst.output in vrf_rt6_create() and
> vrf_rtable_create() used a hard tab before the '='. The neighboring
> assignments did not. Make the assignment of rth->dst.output consistent
> with the surrounding code.
> 
> Signed-off-by: Chris Packham 

Applied.


Re: [PATCH] geneve: fix tx_errors statistics

2016-06-23 Thread David Miller
From: Haishuang Yan 
Date: Tue, 21 Jun 2016 16:26:49 +0800

> Tx errors present summation of errors encountered while transmitting
> packets.
> 
> Signed-off-by: Haishuang Yan 

Applied.


Re: [PATCH net-next] net: dsa: b53: Fix statistics readings

2016-06-23 Thread David Miller
From: Florian Fainelli 
Date: Mon, 20 Jun 2016 18:26:53 -0700

> Due to a typo we would always be using the MIB counter width of the
> first element of the counter array instead of the current element, and
> we would always be accessing the register statistics with a 64-bits
> read, while some could be 32-bits. This got unnoticed in testing with
> MDIO and SRAB which tolerate doing this, but testing with the SPI bus
> revealed bogus values being returned. Fix this by using the proper
> iterator here.
> 
> Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch")
> Reported-by: Jonas Gorski 
> Signed-off-by: Florian Fainelli 

Applied.


Re: [PATCH net-next] ti_cpsw: Check for disabled child nodes

2016-06-23 Thread David Miller
From: Ben Hutchings 
Date: Tue, 21 Jun 2016 01:16:31 +0100

> Dual MAC devices don't necessarily have both MACs wired up, so ignore
> those that are disabled.
> 
> Signed-off-by: Ben Hutchings 

Applied.


Re: of_mdio: Enable fixed PHY support if driver is a module

2016-06-23 Thread David Miller
From: Ben Hutchings 
Date: Tue, 21 Jun 2016 01:10:55 +0100

> The fixed_phy driver doesn't have to be built-in, and it's
> important that of_mdio supports it even if it's a module.
> 
> Signed-off-by: Ben Hutchings 

Applied.


Re: [PATCH net] netem: fix a use after free

2016-06-23 Thread David Miller
From: Eric Dumazet 
Date: Mon, 20 Jun 2016 15:00:43 -0700

> From: Eric Dumazet 
> 
> If the packet was dropped by lower qdisc, then we must not
> access it later.
> 
> Save qdisc_pkt_len(skb) in a temp variable.
> 
> Fixes: 2f5fb43f ("net_sched: update hierarchical backlog too")
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable, thanks.


Re: [PATCH] bridge: netfilter: spanning tree: Add masked_ether_addr_equal and neatening

2016-06-23 Thread Joe Perches
On Thu, 2016-06-23 at 19:36 +0200, Pablo Neira Ayuso wrote:
> On Wed, Jun 15, 2016 at 01:58:45PM -0700, Joe Perches wrote:
> > 
> > There is code duplication of a masked ethernet address comparison here
> > so make it a separate function instead.
> > 
> > Miscellanea:
> > 
> > o Neaten alignment of FWINV macro uses to make it clearer for the reader
> Applied, thanks.
> 
> > 
> > Signed-off-by: Joe Perches 
> > ---
> > 
> > This masked_ether_addr_equal function could go into etherdevice.h,
> > but I don't see another use like it in kernel code.  Is there one?
> This is specific of iptables, not even nftables would use this. So I
> would keep this in the iptables tree.

Did you see the other patch that adds a generic
ether_addr_equal_masked() and uses it in a few
more files?



[ PATCH] usbnet.c mtu change needs to stop RX

2016-06-23 Thread Soohoon Lee

When the MTU is changed, unlink_urbs() flushes the RX queue, but meanwhile usbnet_bh() can
refill the queue at the same time.
Depending on which HCD is underneath, unlink can take a long time, and then the flush
never ends.


diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 61ba464..e03e3e6 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -708,6 +708,7 @@ static int unlink_urbs (struct usbnet *dev, struct 
sk_buff_head *q)
int count = 0;
 
spin_lock_irqsave (&dev->lock, flags);
+   usbnet_pause_rx(dev);
while (!skb_queue_empty(q)) {
struct skb_data *entry;
struct urb  *urb;
@@ -742,6 +743,7 @@ found:
usb_put_urb(urb);
spin_lock_irqsave(&dev->lock, flags);
}
+   usbnet_resume_rx(dev);
spin_unlock_irqrestore (&dev->lock, flags);
return count;
 }
@@ -1509,6 +1511,7 @@ static void usbnet_bh (unsigned long param)
   netif_device_present (dev->net) &&
   netif_carrier_ok(dev->net) &&
   !timer_pending (&dev->delay) &&
+  !test_bit (EVENT_RX_PAUSED, &dev->flags) &&
   !test_bit (EVENT_RX_HALT, &dev->flags)) {
int temp = dev->rxq.qlen;



Re: [PATCH] Bridge: Fix ipv6 mc snooping if it has no ipv6 address.

2016-06-23 Thread Linus Lüssing
Hi Daniel,

Thanks for submitting this patch here :).


On Thu, Jun 23, 2016 at 11:28:55AM +0200, daniel wrote:
> The bridge is falsely dropping ipv6 multicast packets
> if there is no ipv6 address assigned on the bridge and no
> external mld querier is present.

and if the bridge internal querier is enabled (usually disabled
by default in the bridge code, but enabled by default in OpenWRT
for instance).

> 
> When the bridge fails to build mld queries, because it has no
> ipv6 address, it silently returns, but keeps the local querier enabled.
> (br_multicast.c:832)

Not sure whether David or others like line numbers in commit messages, as
they can change over time.

> 
> Ipv6 multicast snooping can only work if:
>  a) an external querier is present

maybe clarify that this is an OR, not AND?


I think you can add a [PATCH net] tag, as it seems small
enough for stable kernels and fixes a potential, confusing packet
loss case.

Also maybe add a:
--
Fixes: 1d81d4c3dd88 ("bridge: check return value of ipv6_dev_get_saddr()")
--

Regards, Linus


PS: Does not seem to apply for me on either David's net branch or
Torvald's master branch.

"fatal: patch fragment without header at line 7: @@ -599,10
+612,12 @@ static inline bool"

Try using "git format-patch" and "git send-email" instead. Also
check ./scripts/get_maintainer.pl for a few more email addresses
to add.


Re: [PATCH v2 2/2] netfilter/nflog: nflog-range does not truncate packets (userspace)

2016-06-23 Thread Pablo Neira Ayuso
On Tue, Jun 21, 2016 at 03:02:16PM -0400, Vishwanath Pai wrote:
> netfilter/nflog: nflog-range does not truncate packets
> 
> The option --nflog-range has never worked, but we cannot just fix this
> because users might be using this feature option and their behavior would
> change. Instead add a new option --nflog-size. This option works the same
> way nflog-range should have, and both of them are mutually exclusive. When
> someone uses --nflog-range we print a warning message informing them that
> this feature has no effect.
> 
> To indicate the kernel that the user has set --nflog-size we have to pass a
> new flag XT_NFLOG_F_COPY_LEN.
> 
> Also updated the man page to reflect this.

Please, send me a v3 including tests, see:

iptables/extensions/libxt_NFLOG.t

Thanks.


Re: [PATCH] bridge: netfilter: spanning tree: Add masked_ether_addr_equal and neatening

2016-06-23 Thread Pablo Neira Ayuso
On Wed, Jun 15, 2016 at 01:58:45PM -0700, Joe Perches wrote:
> There is code duplication of a masked ethernet address comparison here
> so make it a separate function instead.
> 
> Miscellanea:
> 
> o Neaten alignment of FWINV macro uses to make it clearer for the reader

Applied, thanks.

> Signed-off-by: Joe Perches 
> ---
> 
> This masked_ether_addr_equal function could go into etherdevice.h,
> but I don't see another use like it in kernel code.  Is there one?

This is specific of iptables, not even nftables would use this. So I
would keep this in the iptables tree.


[iproute PATCH v3 6/6] misc/ifstat: simplify unsigned value comparison

2016-06-23 Thread Phil Sutter
By directly comparing the value of both unsigned variables, casting to
signed becomes unnecessary.

This also fixes for compiling with older versions of gcc (at least
<=3.4.6) which emit the following warning:

| ifstat.c: In function `update_db':
| ifstat.c:542: warning: comparison is always false due to limited range of 
data type

Signed-off-by: Phil Sutter 
---
 misc/ifstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misc/ifstat.c b/misc/ifstat.c
index abbb4e732fcef..9a44da487599e 100644
--- a/misc/ifstat.c
+++ b/misc/ifstat.c
@@ -539,7 +539,7 @@ static void update_db(int interval)
int i;
 
for (i = 0; i < MAXS; i++) {
-   if ((long)(h1->ival[i] - n->ival[i]) < 
0) {
+   if (h1->ival[i] < n->ival[i]) {
memset(n->ival, 0, 
sizeof(n->ival));
break;
}
-- 
2.8.2



[iproute PATCH v3 5/6] Makefile: Allow to override CC

2016-06-23 Thread Phil Sutter
This makes it easier to build iproute2 with a custom compiler.

While at it, make HOSTCC default to the value of CC if not explicitly
set elsewhere.

Signed-off-by: Phil Sutter 
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 15c81ecfdca3a..fa200ddb76679 100644
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,8 @@ ADDLIB+=ipx_ntop.o ipx_pton.o
 #options for mpls
 ADDLIB+=mpls_ntop.o mpls_pton.o
 
-CC = gcc
-HOSTCC = gcc
+CC := gcc
+HOSTCC ?= $(CC)
 DEFINES += -D_GNU_SOURCE
 # Turn on transparent support for LFS
 DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
-- 
2.8.2



[iproute PATCH v3 2/6] Use C99 style initializers everywhere

2016-06-23 Thread Phil Sutter
This big patch was compiled by vimgrepping for memset calls and changing
to C99 initializer if applicable. One notable exception is the
initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
for older gcc versions (at least <=3.4.6).

Calls to memset for struct rtattr pointer fields for parse_rtattr*()
were just dropped since they are not needed.

The changes here allowed the compiler to discover some unused variables,
so get rid of them, too.

Signed-off-by: Phil Sutter 
---
Changes since v2:
- Flatten initializers.
- Leave a final comma in place.
- Fix checkpatch warnings.
- Initialize nlmsg_seq in the declaration, too.
- Use C99-style init in tc_bpf.c to get rid of the memset().
Changes since v1:
- Dropped former changes to tc/tc_bpf.c as they are incompatible to older
  gcc versions (at least <=3.4.6).
---
 bridge/fdb.c |  25 ++---
 bridge/link.c|  14 +++
 bridge/mdb.c |  17 -
 bridge/vlan.c|  17 -
 genl/ctrl.c  |  44 +-
 ip/ip6tunnel.c   |  10 ++---
 ip/ipaddress.c   |  31 +++-
 ip/ipaddrlabel.c |  21 ---
 ip/iplink.c  |  61 +-
 ip/iplink_can.c  |   4 +-
 ip/ipmaddr.c |  25 -
 ip/ipmroute.c|   8 +---
 ip/ipneigh.c |  30 ++-
 ip/ipnetconf.c   |  10 ++---
 ip/ipnetns.c |  39 +---
 ip/ipntable.c|  25 -
 ip/iproute.c |  78 ++-
 ip/iprule.c  |  22 +--
 ip/iptoken.c |  19 --
 ip/iptunnel.c|  31 +---
 ip/ipxfrm.c  |  26 -
 ip/link_gre.c|  18 -
 ip/link_gre6.c   |  18 -
 ip/link_ip6tnl.c |  25 +
 ip/link_iptnl.c  |  22 +--
 ip/link_vti.c|  18 -
 ip/link_vti6.c   |  18 -
 ip/xfrm_policy.c |  99 -
 ip/xfrm_state.c  | 110 +++
 lib/libnetlink.c |  77 ++
 lib/ll_map.c |   1 -
 misc/arpd.c  |  64 ++--
 misc/ss.c|  37 +++
 tc/e_bpf.c   |   7 +---
 tc/em_cmp.c  |   4 +-
 tc/em_ipset.c|   4 +-
 tc/em_meta.c |   4 +-
 tc/em_nbyte.c|   4 +-
 tc/em_u32.c  |   4 +-
 tc/f_flow.c  |   3 --
 tc/f_flower.c|   3 +-
 tc/f_fw.c|   6 +--
 tc/f_route.c |   3 --
 tc/f_rsvp.c  |   6 +--
 tc/f_u32.c   |  12 ++
 tc/m_bpf.c   |   5 +--
 tc/m_csum.c  |   4 +-
 tc/m_ematch.c|   4 +-
 tc/m_gact.c  |   5 +--
 tc/m_ife.c   |   5 +--
 tc/m_mirred.c|   7 +---
 tc/m_nat.c   |   4 +-
 tc/m_pedit.c |   8 +---
 tc/m_police.c|   5 +--
 tc/q_atm.c   |   3 +-
 tc/q_cbq.c   |  22 +++
 tc/q_choke.c |   4 +-
 tc/q_codel.c |   3 +-
 tc/q_dsmark.c|   1 -
 tc/q_fifo.c  |   4 +-
 tc/q_fq_codel.c  |   3 +-
 tc/q_hfsc.c  |  13 ++-
 tc/q_htb.c   |  15 +++-
 tc/q_netem.c |  16 +++-
 tc/q_red.c   |   4 +-
 tc/q_sfb.c   |  17 -
 tc/q_sfq.c   |   4 +-
 tc/q_tbf.c   |   4 +-
 tc/tc_bpf.c  |  54 ++-
 tc/tc_class.c|  31 ++--
 tc/tc_exec.c |   3 +-
 tc/tc_filter.c   |  33 ++---
 tc/tc_qdisc.c|  33 ++---
 tc/tc_stab.c |   4 +-
 tc/tc_util.c |   3 +-
 75 files changed, 532 insertions(+), 913 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index be849f980a802..59538b1e16506 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv)
struct nlmsghdr n;
struct ifinfomsgifm;
charbuf[256];
-   } req;
+   } req = {
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+   .ifm.ifi_family = PF_BRIDGE,
+   };
 
char *filter_dev = NULL;
char *br = NULL;
int msg_size = sizeof(struct ifinfomsg);
 
-   memset(, 0, sizeof(req));
-   req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-   req.ifm.ifi_family = PF_BRIDGE;
-
while (argc > 0) {
if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 
0) {
NEXT_ARG();
@@ -247,7 +246,13 @@ static int fdb_modify(int cmd, int flags, int argc, char 
**argv)
struct nlmsghdr n;
struct ndmsgndm;
charbuf[256];
-   } req;
+   } req = {
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
+   .ndm.ndm_family = PF_BRIDGE,
+   .ndm.ndm_state = NUD_NOARP,
+   };
char *addr = NULL;
char *d = NULL;
char abuf[ETH_ALEN];
@@ -259,14 

[iproute PATCH v3 1/6] tc: m_action: Improve conversion to C99 style initializers

2016-06-23 Thread Phil Sutter
This improves my initial change in the following points:

- Flatten embedded struct's initializers.
- No need to initialize variables to zero as the key feature of C99
  initializers is to do this implicitly.
- By relocating the declaration of struct rtattr *tail, it can be
  initialized at the same time.

Fixes: a0a73b298a579 ("tc: m_action: Use C99 style initializers for struct req")
Signed-off-by: Phil Sutter 
---
Changes since v2:
- Don't drop the "superfluous" comma.
- Flatten initializers.
Changes since v1:
- Created this patch.
---
 tc/m_action.c | 23 +++
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index ea16817aefd4f..806fdd197965d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -395,13 +395,10 @@ static int tc_action_gd(int cmd, unsigned int flags, int 
*argc_p, char ***argv_p
struct tcamsg   t;
charbuf[MAX_MSG];
} req = {
-   .n = {
-   .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
-   },
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
 
argc -= 1;
@@ -491,23 +488,18 @@ static int tc_action_modify(int cmd, unsigned int flags, 
int *argc_p, char ***ar
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-
-   struct rtattr *tail;
struct {
struct nlmsghdr n;
struct tcamsg   t;
charbuf[MAX_MSG];
} req = {
-   .n = {
-   .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
-   },
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
+   struct rtattr *tail = NLMSG_TAIL();
 
-   tail = NLMSG_TAIL();
argc -= 1;
argv += 1;
if (parse_action(, , TCA_ACT_TAB, )) {
@@ -540,7 +532,6 @@ static int tc_act_list_or_flush(int argc, char **argv, int 
event)
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
 
tail = NLMSG_TAIL();
-- 
2.8.2



[iproute PATCH v3 3/6] Replace malloc && memset by calloc

2016-06-23 Thread Phil Sutter
This only replaces occurrences where the newly allocated memory is
cleared completely afterwards, as in other cases it is a theoretical
performance hit although code would be cleaner this way.

Signed-off-by: Phil Sutter 
---
Changes since v2:
- Fix checkpatch errors.
---
 genl/genl.c|  3 +--
 lib/names.c|  7 ++-
 misc/lnstat.c  |  6 ++
 misc/lnstat_util.c |  4 +---
 tc/em_canid.c  |  4 ++--
 tc/m_action.c  |  3 +--
 tc/m_ipt.c | 13 -
 tc/m_pedit.c   |  3 +--
 tc/tc.c|  9 +++--
 tc/tc_bpf.c|  4 +---
 tc/tc_class.c  |  3 +--
 tc/tc_exec.c   |  3 +--
 12 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/genl/genl.c b/genl/genl.c
index e33fafdf2f524..747074b029a7b 100644
--- a/genl/genl.c
+++ b/genl/genl.c
@@ -86,9 +86,8 @@ reg:
return f;
 
 noexist:
-   f = malloc(sizeof(*f));
+   f = calloc(1, sizeof(*f));
if (f) {
-   memset(f, 0, sizeof(*f));
strncpy(f->name, str, 15);
f->parse_genlopt = parse_nofopt;
f->print_genlopt = print_nofopt;
diff --git a/lib/names.c b/lib/names.c
index 3b5b0b1e1201a..fbd6503f22d42 100644
--- a/lib/names.c
+++ b/lib/names.c
@@ -54,15 +54,12 @@ struct db_names *db_names_alloc(void)
 {
struct db_names *db;
 
-   db = malloc(sizeof(*db));
+   db = calloc(1, sizeof(*db));
if (!db)
return NULL;
 
-   memset(db, 0, sizeof(*db));
-
db->size = MAX_ENTRIES;
-   db->hash = malloc(sizeof(struct db_entry *) * db->size);
-   memset(db->hash, 0, sizeof(struct db_entry *) * db->size);
+   db->hash = calloc(db->size, sizeof(struct db_entry *));
 
return db;
 }
diff --git a/misc/lnstat.c b/misc/lnstat.c
index 659a01bd69931..863fd4d9f03f2 100644
--- a/misc/lnstat.c
+++ b/misc/lnstat.c
@@ -182,10 +182,8 @@ static struct table_hdr *build_hdr_string(struct 
lnstat_file *lnstat_files,
static struct table_hdr th;
int ofs = 0;
 
-   for (i = 0; i < HDR_LINES; i++) {
-   th.hdr[i] = malloc(HDR_LINE_LENGTH);
-   memset(th.hdr[i], 0, HDR_LINE_LENGTH);
-   }
+   for (i = 0; i < HDR_LINES; i++)
+   th.hdr[i] = calloc(1, HDR_LINE_LENGTH);
 
for (i = 0; i < fps->num; i++) {
char *cname, *fname = fps->params[i].lf->name;
diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c
index d918151282f55..cc54598fe1bef 100644
--- a/misc/lnstat_util.c
+++ b/misc/lnstat_util.c
@@ -173,15 +173,13 @@ static struct lnstat_file *alloc_and_open(const char 
*path, const char *file)
struct lnstat_file *lf;
 
/* allocate */
-   lf = malloc(sizeof(*lf));
+   lf = calloc(1, sizeof(*lf));
if (!lf) {
fprintf(stderr, "out of memory\n");
return NULL;
}
 
/* initialize */
-   memset(lf, 0, sizeof(*lf));
-
/* de->d_name is guaranteed to be <= NAME_MAX */
strcpy(lf->basename, file);
strcpy(lf->path, path);
diff --git a/tc/em_canid.c b/tc/em_canid.c
index 16f6ed5c0b7a4..ceb64cb933f51 100644
--- a/tc/em_canid.c
+++ b/tc/em_canid.c
@@ -106,8 +106,8 @@ static int canid_parse_eopt(struct nlmsghdr *n, struct 
tcf_ematch_hdr *hdr,
if (args == NULL)
return PARSE_ERR(args, "canid: missing arguments");
 
-   rules.rules_raw = malloc(sizeof(struct can_filter) * 
rules.rules_capacity);
-   memset(rules.rules_raw, 0, sizeof(struct can_filter) * 
rules.rules_capacity);
+   rules.rules_raw = calloc(rules.rules_capacity,
+sizeof(struct can_filter));
 
do {
if (!bstrcmp(args, "sff")) {
diff --git a/tc/m_action.c b/tc/m_action.c
index 806fdd197965d..24f8b5d855211 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -126,9 +126,8 @@ noexist:
goto restart_s;
}
 #endif
-   a = malloc(sizeof(*a));
+   a = calloc(1, sizeof(*a));
if (a) {
-   memset(a, 0, sizeof(*a));
strncpy(a->id, "noact", 15);
a->parse_aopt = parse_noaopt;
a->print_aopt = print_noaopt;
diff --git a/tc/m_ipt.c b/tc/m_ipt.c
index 098f610f9439a..d6f62bd6b32c9 100644
--- a/tc/m_ipt.c
+++ b/tc/m_ipt.c
@@ -164,16 +164,11 @@ get_target_name(const char *name)
return NULL;
 #endif
 
-   new_name = malloc(strlen(name) + 1);
-   lname = malloc(strlen(name) + 1);
-   if (new_name)
-   memset(new_name, '\0', strlen(name) + 1);
-   else
+   new_name = calloc(1, strlen(name) + 1);
+   lname = calloc(1, strlen(name) + 1);
+   if (!new_name)
exit_error(PARAMETER_PROBLEM, "get_target_name");
-
-   if (lname)
-   memset(lname, '\0', strlen(name) + 1);
-   else
+   if (!lname)
exit_error(PARAMETER_PROBLEM, "get_target_name");
 
strcpy(new_name, name);
diff 

[iproute PATCH v3 4/6] No need to initialize rtattr fields before parsing

2016-06-23 Thread Phil Sutter
Since parse_rtattr_flags() calls memset already, there is no need for
callers to do so themselves.

Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 2 +-
 tc/tc_class.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 62856f2c26eba..703a56b88d257 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -439,7 +439,7 @@ static void print_num(FILE *fp, unsigned int width, 
uint64_t count)
 
 static void print_vf_stats64(FILE *fp, struct rtattr *vfstats)
 {
-   struct rtattr *vf[IFLA_VF_STATS_MAX + 1] = {};
+   struct rtattr *vf[IFLA_VF_STATS_MAX + 1];
 
if (vfstats->rta_type != IFLA_VF_STATS) {
fprintf(stderr, "BUG: rta type is %d\n", vfstats->rta_type);
diff --git a/tc/tc_class.c b/tc/tc_class.c
index 158b4b18506eb..0d6000b91f539 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -219,7 +219,7 @@ static void graph_cls_show(FILE *fp, char *buf, struct 
hlist_head *root_list,
 {
struct hlist_node *n, *tmp_cls;
char cls_id_str[256] = {};
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char str[100] = {};
 
@@ -304,7 +304,7 @@ int print_class(const struct sockaddr_nl *who,
FILE *fp = (FILE *)arg;
struct tcmsg *t = NLMSG_DATA(n);
int len = n->nlmsg_len;
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char abuf[256];
 
-- 
2.8.2



[iproute PATCH v3 0/6] Big C99 style initializer rework

2016-06-23 Thread Phil Sutter
This is v3 of my C99-style initializer related patch series. The changes
since v2 are:

- Flattened embedded struct's initializers:
  Since the field names are very short, I figured it makes more sense to
  keep indenting low. Also, the same style is already used in
  ip/xfrm_policy.c so take that as an example.

- Moved leftover nlmsg_seq initializing into the common place as well:
  I was unsure whether this is a good idea at first (due to the
  increment), but again it's done in ip/xfrm_policy.c as well so should
  be fine.

- Added a comma after the last field initializer as suggested by Jakub.

- Dropped patch 7 since it was NACKed.

- Eliminated checkpatch non-compliance.

- Second go at union bpf_attr in tc/tc_bpf.c:
  I figured that while it is not possible to initialize fields, gcc-3.4.6
  does not complain when setting the whole union to zero using '= {0}'.
  So I did this and thereby at least got rid of the memset calls.

For reference, here's the v2 changelog:

- Rebased onto current upstream master:
  My own commit a0a73b298a579 ("tc: m_action: Use C99 style initializers
  for struct req") contains most of the changes to tc/m_action.c already,
  so I put the remaining ones into a dedicated patch (the first one here)
  with a better description.

- Tested against gcc-3.4.6:
  This is the oldest gcc version I was able to install locally. It indeed
  does not like the former changes in tc/tc_bpf.c, so I reverted them.
  Apart from emitting many warnings, it successfully compiles the
  sources.

In the process of compatibility testing, I made a few more changes which
make sense to have:

- New patch 5 allows to conveniently override the compiler via command
  line.

- New patch 6 eliminates a warning with old gcc but looks valid in
  general.

- A warning made me look at ip/tcp_metrics.c and I found a minor code
  simplification (patch 7).

Phil Sutter (6):
  tc: m_action: Improve conversion to C99 style initializers
  Use C99 style initializers everywhere
  Replace malloc && memset by calloc
  No need to initialize rtattr fields before parsing
  Makefile: Allow to override CC
  misc/ifstat: simplify unsigned value comparison

 Makefile   |   4 +-
 bridge/fdb.c   |  25 ++--
 bridge/link.c  |  14 +++
 bridge/mdb.c   |  17 -
 bridge/vlan.c  |  17 -
 genl/ctrl.c|  44 +
 genl/genl.c|   3 +-
 ip/ip6tunnel.c |  10 ++---
 ip/ipaddress.c |  33 +++-
 ip/ipaddrlabel.c   |  21 --
 ip/iplink.c|  61 -
 ip/iplink_can.c|   4 +-
 ip/ipmaddr.c   |  25 
 ip/ipmroute.c  |   8 +---
 ip/ipneigh.c   |  30 ++-
 ip/ipnetconf.c |  10 ++---
 ip/ipnetns.c   |  39 +--
 ip/ipntable.c  |  25 
 ip/iproute.c   |  78 +
 ip/iprule.c|  22 +--
 ip/iptoken.c   |  19 -
 ip/iptunnel.c  |  31 +--
 ip/ipxfrm.c|  26 -
 ip/link_gre.c  |  18 -
 ip/link_gre6.c |  18 -
 ip/link_ip6tnl.c   |  25 +---
 ip/link_iptnl.c|  22 +--
 ip/link_vti.c  |  18 -
 ip/link_vti6.c |  18 -
 ip/xfrm_policy.c   |  99 +++
 ip/xfrm_state.c| 110 ++---
 lib/libnetlink.c   |  77 ++---
 lib/ll_map.c   |   1 -
 lib/names.c|   7 +---
 misc/arpd.c|  64 ++-
 misc/ifstat.c  |   2 +-
 misc/lnstat.c  |   6 +--
 misc/lnstat_util.c |   4 +-
 misc/ss.c  |  37 +++---
 tc/e_bpf.c |   7 +---
 tc/em_canid.c  |   4 +-
 tc/em_cmp.c|   4 +-
 tc/em_ipset.c  |   4 +-
 tc/em_meta.c   |   4 +-
 tc/em_nbyte.c  |   4 +-
 tc/em_u32.c|   4 +-
 tc/f_flow.c|   3 --
 tc/f_flower.c  |   3 +-
 tc/f_fw.c  |   6 +--
 tc/f_route.c   |   3 --
 tc/f_rsvp.c|   6 +--
 tc/f_u32.c |  12 ++
 tc/m_action.c  |  26 -
 tc/m_bpf.c |   5 +--
 tc/m_csum.c|   4 +-
 tc/m_ematch.c  |   4 +-
 tc/m_gact.c|   5 +--
 tc/m_ife.c |   5 +--
 tc/m_ipt.c |  13 ++-
 tc/m_mirred.c  |   7 +---
 tc/m_nat.c |   4 +-
 tc/m_pedit.c   |  11 ++
 tc/m_police.c  |   5 +--
 tc/q_atm.c |   3 +-
 tc/q_cbq.c |  22 +++
 tc/q_choke.c   |   4 +-
 tc/q_codel.c   |   3 +-
 tc/q_dsmark.c  |   1 -
 tc/q_fifo.c|   4 +-
 tc/q_fq_codel.c|   3 +-
 tc/q_hfsc.c|  13 ++-
 tc/q_htb.c |  15 +++-
 tc/q_netem.c   |  16 +++-
 tc/q_red.c |   4 +-
 tc/q_sfb.c |  17 -
 tc/q_sfq.c |   4 +-
 tc/q_tbf.c |   4 +-
 tc/tc.c|   9 ++---
 tc/tc_bpf.c|  58 

Re: [PATCH v2 1/2] netfilter/nflog: nflog-range does not truncate packets

2016-06-23 Thread Pablo Neira Ayuso
On Tue, Jun 21, 2016 at 02:58:46PM -0400, Vishwanath Pai wrote:
> netfilter/nflog: nflog-range does not truncate packets
> 
> li->u.ulog.copy_len is currently ignored by the kernel, we should truncate
> the packet to either li->u.ulog.copy_len (if set) or copy_range before
> sending it to userspace. 0 is a valid input for copy_len, so add a new
> flag to indicate whether this was option was specified by the user or not.
> 
> Add two flags to indicate whether nflog-size/copy_len was set or not.
> XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log
> 
> On the userspace side, this was initially represented by the option
> nflog-range, this will be replaced by --nflog-size now. --nflog-range would
> still exist but does not do anything.

Applied, thanks!


[4.6] kernel BUG at net/ipv6/raw.c:592

2016-06-23 Thread Dave Jones

Found this logs after a Trinity run.

kernel BUG at net/ipv6/raw.c:592!
[ cut here ]
invalid opcode:  [#1] SMP 

Modules linked in: udp_diag dccp_ipv6 dccp_ipv4 dccp sctp af_key tcp_diag 
inet_diag ip6table_filter xt_NFLOG nfnetlink_log xt_comment xt_statistic 
iptable_filter nfsv3 nfs_acl nfs fscache lockd grace autofs4 i2c_piix4 
rpcsec_gss_krb5 auth_rpcgss oid_registry sunrpc loop dummy ipmi_devintf 
iTCO_wdt iTCO_vendor_support acpi_cpufreq efivars ipmi_si ipmi_msghandler 
i2c_i801 i2c_core sg lpc_ich mfd_core button

CPU: 2 PID: 28854 Comm: trinity-c23 Not tainted 4.6.0 #1
Hardware name: Quanta Leopard-DDR3/Leopard-DDR3, BIOS F06_3A14.DDR3 05/13/2015
task: 880459cab600 ti: 880747bc4000 task.ti: 880747bc4000
RIP: 0010:[] [] rawv6_sendmsg+0xc30/0xc40
RSP: 0018:880747bc7bf8  EFLAGS: 00010282
RAX: fff2 RBX: 88080c6f2d00 RCX: 0002
RDX: 880747bc7cd8 RSI: 0030 RDI: 8803de801500
RBP: 880747bc7d90 R08: 002d R09: 0009
R10: 8803de801500 R11: 0009 R12: 0030
R13: 8803de801500 R14: 88086d67e000 R15: 88046bdac480
FS:  7fe29c566700() GS:88046fa4() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 01f0f2c0 CR3: 00080b99d000 CR4: 001406e0
Stack:
  88086d67e000 880747bc7d18 88046bdac480
 8804  880747bc7c68 88086d67e000
 8808002d 88080009  0001
 
Call Trace:
 [] ? page_fault+0x22/0x30
 [] ? bad_to_user+0x6a/0x6fa
 [] inet_sendmsg+0x67/0xa0
 [] sock_sendmsg+0x38/0x50
 [] sock_write_iter+0x78/0xd0
 [] __vfs_write+0xaa/0xe0
 [] vfs_write+0xa2/0x1a0
 [] SyS_write+0x46/0xa0 
 [] entry_SYSCALL_64_fastpath+0x13/0x8f
Code: 23 f7 ff ff f7 d0 41 01 c0 41 83 d0 00 e9 ac fd ff ff 48 8b 44 24 48 48 
8b 80 c0 01 00 00 65 48 ff 40 28 8b 51 78 d0 41 01 c0 41 83 d0 00 e9 ac fd ff 
ff 48 8b 44 24 48 48 8b 80 c0 01 00 00 65 48 ff 40 28 8b 51 78 e9 64 fe ff ff 
<0f> 0b 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 

RIP [] rawv6_sendmsg+0xc30/0xc40
 RSP 

 590 
 591 offset += skb_transport_offset(skb);
 592 BUG_ON(skb_copy_bits(skb, offset, &csum, 2));
 593 



Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-23 Thread Alexander Duyck
On Wed, Jun 22, 2016 at 9:17 PM, Yuval Mintz  wrote:
>> Then again, if you're basically saying every HW-assisted offload on
>> receive should be done under LRO flag, what would be the use case
>> where a GRO-assisted offload would help?
>
>> I.e., afaik LRO is superior to GRO in `brute force' -
>> it creates better packed packets and utilizes memory better
>> [with all the obvious cons such as inability for defragmentation].
>> So if you'd have the choice of having an adapter perform 'classic'
>> LRO aggregation or something that resembles a GRO packet,
>> what would be the gain from doing the latter?

LRO and GRO shouldn't really differ in packing or anything like that.
The big difference between the two is that LRO is destructive while
GRO is not.  Specifically in the case of GRO you should be able to
take the resultant frame, feed it through GSO, and get the original
stream of frames back out.  So you can pack the frames however you
want the only key is that you must capture all the correct offsets and
set the gso_size correct for the flow.

> Just to relate to bnx2x/qede differences in current implementation -
> when this GRO hw-offload was added to bnx2x, it has already
> supported classical LRO, and due to above statement whenever LRO
> was set driver aggregated incoming traffic as classic LRO.
> I agree that in hindsight the lack of distinction between sw/hw GRO
> was hurting us.

In the case of bnx2x it sounds like you have issues that are
significantly hurting the performance versus classic software GRO.  If
that is the case you might want to simply flip the logic for the
module parameter that Rick mentioned and just disable the hardware
assisted GRO unless it is specifically requested.

> qede isn't implementing LRO, so we could easily mark this feature
> under LRO there - but question is, given that the adapter can support
> LRO, if we're going to suffer from all the shortages that arise from
> putting this feature under LRO, why should we bother?

The idea is to address feature isolation.  The fact is the hardware
exists outside of kernel control.  If you end up linking an internal
kernel feature to your device like this you are essentially stripping
the option of using the kernel feature.

I would prefer to see us extend LRO to support "close enough GRO"
instead of have us extend GRO to also include LRO.  That way when we
encounter issues like the FW limitation that Rick encountered he can
just go in and disable the LRO and have true GRO kick in which would
be significantly better than having to poke around through
documentation to find a module parameter that can force the feature
off.  Really the fact that you have to use a module parameter is
frowned upon as well as most drivers aren't supposed to be using those
in the netdev tree.

> You can argue that we might need a new feature bit for control
> over such a feature; If we don't do that, is there any gain in all of this?

I would argue that yes there are many cases where we will be able to
show gain.  The fact is there is a strong likelihood of the GRO on
your parts having some differences either now, or at some point in the
future as the code evolves.  As I mentioned there was already some
talk about possibly needing to push the UDP tunnel aggregation out of
GRO and perhaps handling it sometime after IP look up had verified
that the destination was in fact a local address in the namespace.  In
addition it makes the changes to include the tunnel encapsulation much
more acceptable as LRO is already naturally dropped in the routing and
bridging cases if I recall correctly.

- Alex


Re: [PATCH net-next v2 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-23 Thread Martin KaFai Lau
On Thu, Jun 23, 2016 at 11:53:50AM +0200, Daniel Borkmann wrote:
> >diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >index 668e079..68753e0 100644
> >--- a/kernel/bpf/verifier.c
> >+++ b/kernel/bpf/verifier.c
> >@@ -1062,6 +1062,10 @@ static int check_map_func_compatibility(struct 
> >bpf_map *map, int func_id)
> > if (func_id != BPF_FUNC_get_stackid)
> > goto error;
> > break;
> >+case BPF_MAP_TYPE_CGROUP_ARRAY:
> >+if (func_id != BPF_FUNC_skb_in_cgroup)
> >+goto error;
> >+break;
>
> I think the BPF_MAP_TYPE_CGROUP_ARRAY case should have been fist here in
> patch 2/4, but with unconditional goto error. And this one only adds the
> 'func_id != BPF_FUNC_skb_in_cgroup' test.
I am not sure I understand.  Can you elaborate? I am probably missing
something here.

>
> > default:
> > break;
> > }
> >@@ -1081,6 +1085,10 @@ static int check_map_func_compatibility(struct 
> >bpf_map *map, int func_id)
> > if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
> > goto error;
> > break;
> >+case BPF_FUNC_skb_in_cgroup:
> >+if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
> >+goto error;
> >+break;
> > default:
> > break;
> > }


Re: [PATCH net-next V2 07/10] ethtool: Add 50G baseSR2 link mode

2016-06-23 Thread David Decotigny
On Thu, Jun 23, 2016 at 7:02 AM, Saeed Mahameed  wrote:
> From: Gal Pressman 
>
> Add ETHTOOL_LINK_MODE_5baseSR2_Full_BIT bit.
>
> Signed-off-by: Gal Pressman 
> Signed-off-by: Saeed Mahameed 
> Cc: Ben Hutchings 
> Cc: David Decotigny 
> ---
>  include/uapi/linux/ethtool.h | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
> index 5f030b4..b8f38e8 100644
> --- a/include/uapi/linux/ethtool.h
> +++ b/include/uapi/linux/ethtool.h
> @@ -1362,6 +1362,7 @@ enum ethtool_link_mode_bit_indices {
> ETHTOOL_LINK_MODE_10baseSR4_Full_BIT= 37,
> ETHTOOL_LINK_MODE_10baseCR4_Full_BIT= 38,
> ETHTOOL_LINK_MODE_10baseLR4_ER4_Full_BIT= 39,
> +   ETHTOOL_LINK_MODE_5baseSR2_Full_BIT = 40,
>
> /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
>  * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
> @@ -1370,7 +1371,7 @@ enum ethtool_link_mode_bit_indices {
>  */
>
> __ETHTOOL_LINK_MODE_LAST
> - = ETHTOOL_LINK_MODE_10baseLR4_ER4_Full_BIT,
> + = ETHTOOL_LINK_MODE_5baseSR2_Full_BIT,
>  };
>
>  #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \
> --
> 2.8.0
>

Acked-By: David Decotigny 


Re: [PATCH 2/3] can: fix oops caused by wrong rtnl dellink usage

2016-06-23 Thread Oliver Hartkopp

On 06/23/2016 03:09 PM, Sergei Shtylyov wrote:


+static void can_dellink(struct net_device *dev, struct list_head
*head)
+{
+return;


   Why?



http://marc.info/?l=linux-can=146651600421205=2

The same reason as for commit 993e6f2fd.


   I was asking just about the useless *return* statement...



Ah!

I did some investigation before whether using 'return' in empty void 
functions or not.


static void can_dellink(struct net_device *dev, struct list_head *head);

and

static void can_dellink(struct net_device *dev, struct list_head *head)
{
return;
}

do the same job, right?

But the first one looks like a forward declaration and you would try to 
find the 'implementing' function then.


Of course you can write less code and both implementations are correct - 
but this representation makes it pretty clear that here's nothing to do :-)


Regards,
Oliver




Re: [PATCH net-next 0/4] net_sched: bulk dequeue and deferred drops

2016-06-23 Thread Luigi Rizzo
On Wed, Jun 22, 2016 at 6:49 PM, Eric Dumazet  wrote:
> On Wed, 2016-06-22 at 17:44 +0200, Jesper Dangaard Brouer wrote:
>> On Wed, 22 Jun 2016 07:55:43 -0700
>> Eric Dumazet  wrote:
>>
>> > On Wed, 2016-06-22 at 16:47 +0200, Jesper Dangaard Brouer wrote:
>> > > On Tue, 21 Jun 2016 23:16:48 -0700
>> > > Eric Dumazet  wrote:
>> > >
>> > > > First patch adds an additional parameter to ->enqueue() qdisc method
>> > > > so that drops can be done outside of critical section
>> > > > (after locks are released).
>> > > >
>> > > > Then fq_codel can have a small optimization to reduce number of cache
>> > > > lines misses during a drop event
>> > > > (possibly accumulating hundreds of packets to be freed).
>> > > >
>> > > > A small htb change exports the backlog in class dumps.
>> > > >
>> > > > Final patch adds bulk dequeue to qdiscs that were lacking this feature.
>> > > >
>> > > > This series brings a nice qdisc performance increase (more than 80 %
>> > > > in some cases).
>> > >
>> > > Thanks for working on this Eric! this is great work! :-)
>> >
>> > Thanks Jesper
>> >
>> > I worked yesterday on bulk enqueues, but initial results are not that
>> > great.
>>
>> Hi Eric,
>>
>> This is interesting work! But I think you should read Luigi Rizzo's
>> (Cc'ed) paper on title "A Fast and Practical Software Packet Scheduling
>> Architecture"[1]
>>
>> [1] http://info.iet.unipi.it/~luigi/papers/20160511-mysched-preprint.pdf
>>
>> Luigi will be at Netfilter Workshop next week, and will actually
>> present on topic/paper you two should talk ;-)
>>
>> The article is not a 100% match for what we need, but there is some
>> good ideas.  The article also have a sort of "prequeue" that
>> enqueue'ing CPUs will place packets into.
>>
>> My understanding of the article:
>>
>> 1. transmitters submit packets to an intermediate queue
>>(replace q->enqueue call) lockless submit as queue per CPU
>>(runs in parallel)
>>
>> 2. like we only have _one_ qdisc dequeue process, this process (called
>>arbiter) empty the intermediate queues, and then invoke q->enqueue()
>>and q->dequeue(). (in a locked session/region)
>>
>> 3. Packets returned from q->dequeue() is placed on an outgoing
>>intermediate queue.
>>
>> 4. the transmitter then looks to see there are any packets to drain()
>>from the outgoing queue.  This can run in parallel.
>>
>> If the transmitter submitting a packet, detect no arbiter is running,
>> it can become the arbiter itself.  Like we do with qdisc_run_begin()
>> setting state __QDISC___STATE_RUNNING.
>>
>> The problem with this scheme is push-back from qdisc->enqueue
>> (NET_XMIT_CN) does not "reach" us.  And push-back in-form of processes
>> blocking on qdisc root lock, but that could be handled by either
>> blocking in article's submit() or returning some congestion return code
>> from submit().
>
> Okay, I see that you prepare upcoming conference in Amsterdam,
> but please keep this thread about existing kernel code, not the one that
> eventually reach a new operating system in 5 years ;)
>
> 1) We _want_ the result of the sends, obviously.
>
> 2) We also want back pressure, without adding complex callbacks and
> ref-counting.
>
> 3) We do not want to burn a cpu per TX queue (at least one per NUMA
> node ???) only to send few packets per second,
> Our model is still interrupt based, plus NAPI for interrupt mitigation.
>
> 4) I do not want to lock an innocent cpu to send packets from other
> threads/cpu without a tight control.
>
> In the patch I sent, I basically replaced a locked operation
> (spin_lock(>busylock)) with another one (xchg()) , but I did not add
> yet another queue before the qdisc ones, bufferbloat forbids.
>
> The virtual queue here is one packet per cpu, which basically is the
> same than before this patch, since each cpu spinning on busylock has one
> skb to send anyway.
>
> This is basically a simple extension of MCS locks, where the cpu at the
> head of the queue can queue up to 16 packets, instead of queueing its
> own packet only and give queue owner ship to the following cpu.

Hi Eric (and others),

don't worry, my proposal (PSPAT) is not specifically addressing/targeting
the linux qdisc now, but at the same time it does not have any of the
faults you are worried about.

My target, at a high level, is a VM hosting node where the guest VMs
may create large amounts of traffic, maybe most of it doomed to be dropped,
but still consuming their own and the system's resources by creating the
packets and pounding on the xmit calls.

The goal of PSPAT is to let those clients know very early (possibly even
before doing lookups or encapsulation) when the underlying path
to the NIC will be essentially free for transmission, at which
point the sender can complete building the packet and push it out.


To comment on your observations, PSPAT has the following features:

1) it does return the result of the send, which is run by 

Re: [Patch net 0/2] net_sched: bug fixes for ife action

2016-06-23 Thread David Miller
From: Cong Wang 
Date: Mon, 20 Jun 2016 13:37:17 -0700

> Cong Wang (2):
>   act_ife: only acquire tcf_lock for existing actions
>   act_ife: acquire ife_mod_lock before reading ifeoplist

Series applied, thanks.


Re: [PATCH] mpls: Add missing RCU-bh read side critical section locking in output path

2016-06-23 Thread David Miller
From: Lennert Buytenhek 
Date: Mon, 20 Jun 2016 21:05:27 +0300

> From: David Barroso 
> 
> When locally originated IP traffic hits a route that says to push
> MPLS labels, we'll get a call chain dst_output() -> lwtunnel_output()
> -> mpls_output() -> neigh_xmit() -> ___neigh_lookup_noref() where the
> last function in this chain accesses a RCU-bh protected struct
> neigh_table pointer without us ever having declared an RCU-bh read
> side critical section.
> 
> As in case of locally originated IP traffic we'll be running in process
> context, with softirqs enabled, we can be preempted by a softirq at any
> time, and RCU-bh considers the completion of a softirq as signaling
> the end of any pending read-side critical sections, so if we do get a
> softirq here, we can end up with an unexpected RCU grace period and
> all the nastiness that that comes with.
> 
> This patch makes neigh_xmit() take rcu_read_{,un}lock_bh() around the
> code that expects to be treated as an RCU-bh read side critical section.
> 
> Signed-off-by: David Barroso 
> Signed-off-by: Lennert Buytenhek 

Whilst the case that was used to discover this problem was MPLS, that
is not the subsystem where the bug exists and is being fixed.

Therefore please fix your Subject line.

Thanks.


[PATCH net-next V2 05/10] net/mlx5e: Support adaptive RX coalescing

2016-06-23 Thread Saeed Mahameed
From: Gil Rockah 

Striving for high message rate and low interrupt rate.

Usage:
ethtool -C  adaptive-rx on/off

Signed-off-by: Gil Rockah 
Signed-off-by: Achiad Shochat 
Signed-off-by: Saeed Mahameed 
CC: Arnd Bergmann 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  33 ++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  18 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  30 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 335 +
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   5 +
 6 files changed, 416 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 0c8a7dc..c4f450f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -7,6 +7,7 @@ mlx5_core-y :=  main.o cmd.o debugfs.o fw.o eq.o uar.o 
pagealloc.o \
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \
en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
-   en_txrx.o en_clock.o vxlan.o en_tc.o en_arfs.o
+   en_rx_am.o en_txrx.o en_clock.o vxlan.o en_tc.o \
+   en_arfs.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) +=  en_dcbnl.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 36f625d..aa36a3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -195,6 +195,7 @@ struct mlx5e_params {
 #ifdef CONFIG_MLX5_CORE_EN_DCB
struct ieee_ets ets;
 #endif
+   bool rx_am_enabled;
 };
 
 struct mlx5e_tstamp {
@@ -213,6 +214,7 @@ struct mlx5e_tstamp {
 enum {
MLX5E_RQ_STATE_POST_WQES_ENABLE,
MLX5E_RQ_STATE_UMR_WQE_IN_PROGRESS,
+   MLX5E_RQ_STATE_AM,
 };
 
 struct mlx5e_cq {
@@ -220,6 +222,7 @@ struct mlx5e_cq {
struct mlx5_cqwq   wq;
 
/* data path - accessed per napi poll */
+   u16event_ctr;
struct napi_struct*napi;
struct mlx5_core_cqmcq;
struct mlx5e_channel  *channel;
@@ -247,6 +250,30 @@ struct mlx5e_dma_info {
dma_addr_t  addr;
 };
 
+struct mlx5e_rx_am_stats {
+   int ppms; /* packets per msec */
+   int epms; /* events per msec */
+};
+
+struct mlx5e_rx_am_sample {
+   ktime_t time;
+   unsigned intpkt_ctr;
+   u16 event_ctr;
+};
+
+struct mlx5e_rx_am { /* Adaptive Moderation */
+   u8  state;
+   struct mlx5e_rx_am_statsprev_stats;
+   struct mlx5e_rx_am_sample   start_sample;
+   struct work_struct  work;
+   u8  profile_ix;
+   u8  mode;
+   u8  tune_state;
+   u8  steps_right;
+   u8  steps_left;
+   u8  tired;
+};
+
 struct mlx5e_rq {
/* data path */
struct mlx5_wq_ll  wq;
@@ -267,6 +294,8 @@ struct mlx5e_rq {
unsigned long  state;
intix;
 
+   struct mlx5e_rx_am am; /* Adaptive Moderation */
+
/* control */
struct mlx5_wq_ctrlwq_ctrl;
u8 wq_type;
@@ -637,6 +666,10 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
+void mlx5e_rx_am(struct mlx5e_rq *rq);
+void mlx5e_rx_am_work(struct work_struct *work);
+struct mlx5e_cq_moder mlx5e_am_get_def_profile(u8 rx_cq_period_mode);
+
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
 int mlx5e_create_flow_steering(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 4f433d3..c4be394 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -528,6 +528,7 @@ static int mlx5e_get_coalesce(struct net_device *netdev,
coal->rx_max_coalesced_frames = priv->params.rx_cq_moderation.pkts;
coal->tx_coalesce_usecs   = priv->params.tx_cq_moderation.usec;
coal->tx_max_coalesced_frames = priv->params.tx_cq_moderation.pkts;
+   coal->use_adaptive_rx_coalesce = priv->params.rx_am_enabled;
 
return 0;
 }
@@ -538,6 +539,10 @@ static int mlx5e_set_coalesce(struct net_device *netdev,
struct mlx5e_priv *priv= netdev_priv(netdev);
  

[PATCH net-next V2 00/10] Mellanox 100G mlx5e Ethernet extensions

2016-06-23 Thread Saeed Mahameed
Hi Dave,

This series includes multiple features extensions for mlx5 Ethernet netdevice 
driver.
Namely, TX Rate limiting, RX interrupt moderation, ethtool settings.

TX Rate limiting:
- ConnectX-4 rate limiting infrastructure
- Set max rate NDO support

RX interrupt moderation:
- CQE based coalescing option (controlled via priv flags)
- Adaptive RX coalescing

ethtool settings:
- priv flags callbacks
- Support new ksettings API
- Add 50G missing link mode
- Support auto negotiation on/off

Changes since V1:
- Split ("net/mlx5e: Add 50G missing link mode to ethtool and mlx5 
driver")

Thanks,
Saeed.

Gal Pressman (6):
  net/mlx5e: Introduce net device priv flags infrastructure
  net/mlx5e: Toggle link only after modifying port parameters
  ethtool: Add 50G baseSR2 link mode
  net/mlx5e: Add missing 50G baseSR2 link mode
  net/mlx5e: Use new ethtool get/set link ksettings API
  net/mlx5e: Report correct auto negotiation and allow toggling

Gil Rockah (1):
  net/mlx5e: Support adaptive RX coalescing

Tariq Toukan (1):
  net/mlx5e: CQE based moderation

Yevgeny Petrilin (2):
  net/mlx5: Rate limit tables support
  net/mlx5e: Add TXQ set max rate support

 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  73 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c |   9 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 476 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 181 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 335 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/fw.c   |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  10 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c |  48 ++-
 drivers/net/ethernet/mellanox/mlx5/core/rl.c   | 209 +
 include/linux/mlx5/device.h|   4 +
 include/linux/mlx5/driver.h|  27 ++
 include/linux/mlx5/port.h  |  16 +-
 include/uapi/linux/ethtool.h   |   3 +-
 15 files changed, 1179 insertions(+), 231 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/rl.c

-- 
2.8.0



Re: esp: Fix ESN generation under UDP encapsulation

2016-06-23 Thread David Miller
From: Steffen Klassert 
Date: Thu, 23 Jun 2016 12:40:07 +0200

> On Thu, Jun 23, 2016 at 04:25:21AM +, Blair Steven wrote:
>> This change tests okay in my setup.
>> 
>> Thanks very much
>> -Blair
> 
> David, can you please take this patch directly in the net tree?
> This is a candidate for stable.
> 
> Acked-by: Steffen Klassert 

Applied, thanks everyone.

Does the ipv6 side need the same fix?


[PATCH net-next V2 10/10] net/mlx5e: Report correct auto negotiation and allow toggling

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Previous to this patch auto negotiation was reported off although it was
on by default in hardware. This patch reports the correct information to
ethtool and allows the user to toggle it on/off.

Added another parameter to set port proto function in order to pass
the auto negotiation field to the hardware.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 42 ++
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 36 ---
 include/linux/mlx5/port.h  | 15 ++--
 3 files changed, 80 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 4c560e0..39a4d96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -702,6 +702,8 @@ static int mlx5e_get_link_ksettings(struct net_device 
*netdev,
u32 eth_proto_admin;
u32 eth_proto_lp;
u32 eth_proto_oper;
+   u8 an_disable_admin;
+   u8 an_status;
int err;
 
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
@@ -712,10 +714,12 @@ static int mlx5e_get_link_ksettings(struct net_device 
*netdev,
goto err_query_ptys;
}
 
-   eth_proto_cap   = MLX5_GET(ptys_reg, out, eth_proto_capability);
-   eth_proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin);
-   eth_proto_oper  = MLX5_GET(ptys_reg, out, eth_proto_oper);
-   eth_proto_lp= MLX5_GET(ptys_reg, out, eth_proto_lp_advertise);
+   eth_proto_cap= MLX5_GET(ptys_reg, out, eth_proto_capability);
+   eth_proto_admin  = MLX5_GET(ptys_reg, out, eth_proto_admin);
+   eth_proto_oper   = MLX5_GET(ptys_reg, out, eth_proto_oper);
+   eth_proto_lp = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise);
+   an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin);
+   an_status= MLX5_GET(ptys_reg, out, an_status);
 
ethtool_link_ksettings_zero_link_mode(link_ksettings, supported);
ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
@@ -729,6 +733,18 @@ static int mlx5e_get_link_ksettings(struct net_device 
*netdev,
link_ksettings->base.port = get_connector_port(eth_proto_oper);
get_lp_advertising(eth_proto_lp, link_ksettings);
 
+   if (an_status == MLX5_AN_COMPLETE)
+   ethtool_link_ksettings_add_link_mode(link_ksettings,
+lp_advertising, Autoneg);
+
+   link_ksettings->base.autoneg = an_disable_admin ? AUTONEG_DISABLE :
+ AUTONEG_ENABLE;
+   ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+Autoneg);
+   if (!an_disable_admin)
+   ethtool_link_ksettings_add_link_mode(link_ksettings,
+advertising, Autoneg);
+
 err_query_ptys:
return err;
 }
@@ -764,9 +780,14 @@ static int mlx5e_set_link_ksettings(struct net_device 
*netdev,
 {
struct mlx5e_priv *priv= netdev_priv(netdev);
struct mlx5_core_dev *mdev = priv->mdev;
+   u32 eth_proto_cap, eth_proto_admin;
+   bool an_changes = false;
+   u8 an_disable_admin;
+   u8 an_disable_cap;
+   bool an_disable;
u32 link_modes;
+   u8 an_status;
u32 speed;
-   u32 eth_proto_cap, eth_proto_admin;
int err;
 
speed = link_ksettings->base.speed;
@@ -797,10 +818,17 @@ static int mlx5e_set_link_ksettings(struct net_device 
*netdev,
goto out;
}
 
-   if (link_modes == eth_proto_admin)
+   mlx5_query_port_autoneg(mdev, MLX5_PTYS_EN, &an_status,
+   &an_disable_cap, &an_disable_admin);
+
+   an_disable = link_ksettings->base.autoneg == AUTONEG_DISABLE;
+   an_changes = ((!an_disable && an_disable_admin) ||
+ (an_disable && !an_disable_admin));
+
+   if (!an_changes && link_modes == eth_proto_admin)
goto out;
 
-   mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
+   mlx5_set_port_ptys(mdev, an_disable, link_modes, MLX5_PTYS_EN);
mlx5_toggle_port_link(mdev);
 
 out:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c 
b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 1562e73..752c081 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -202,15 +202,24 @@ int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_proto_oper);
 
-int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
-   int proto_mask)
+int mlx5_set_port_ptys(struct 

[PATCH net-next V2 04/10] net/mlx5e: CQE based moderation

2016-06-23 Thread Saeed Mahameed
From: Tariq Toukan 

In this mode the moderation timer will restart upon
new completion (CQE) generation rather than upon interrupt
generation.

The outcome is that for bursty traffic the period timer will never
expire and thus only the moderation frames counter will dictate
interrupt generation, thus the interrupt rate will be relative
to the incoming packets size.
If the burst seizes for "moderation period" time then an interrupt
will be issued immediately.

CQE based moderation is off by default and can be controlled
via ethtool set_priv_flags.

Performance tested on ConnectX4-Lx 50G.

Less packet loss in netperf UDP and TCP tests, with no bw degradation,
for both single and multi streams, with message sizes of
64, 1024, 1472 and 32768 byte.

Signed-off-by: Tariq Toukan 
Signed-off-by: Achiad Shochat 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Gal Pressman 
Signed-off-by: Gil Rockah 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 20 +---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 54 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 54 --
 3 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 02fa4da..36f625d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -79,6 +79,7 @@
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ (64 * 1024)
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC  0x10
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3
 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS  0x20
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC  0x10
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS  0x20
@@ -145,11 +146,11 @@ struct mlx5e_umr_wqe {
 };
 
 static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
-   "nop",
+   "rx_cqe_moder",
 };
 
 enum mlx5e_priv_flag {
-   MLX5E_PFLAG_NOP = (1 << 0),
+   MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0),
 };
 
 #define MLX5E_SET_PRIV_FLAG(priv, pflag, enable)\
@@ -165,6 +166,11 @@ enum mlx5e_priv_flag {
 #define MLX5E_MIN_BW_ALLOC 1   /* Min percentage of BW allocation */
 #endif
 
+struct mlx5e_cq_moder {
+   u16 usec;
+   u16 pkts;
+};
+
 struct mlx5e_params {
u8  log_sq_size;
u8  rq_wq_type;
@@ -173,12 +179,11 @@ struct mlx5e_params {
u8  log_rq_size;
u16 num_channels;
u8  num_tc;
+   u8  rx_cq_period_mode;
bool rx_cqe_compress_admin;
bool rx_cqe_compress;
-   u16 rx_cq_moderation_usec;
-   u16 rx_cq_moderation_pkts;
-   u16 tx_cq_moderation_usec;
-   u16 tx_cq_moderation_pkts;
+   struct mlx5e_cq_moder rx_cq_moderation;
+   struct mlx5e_cq_moder tx_cq_moderation;
u16 min_rx_wqes;
bool lro_en;
u32 lro_wqe_sz;
@@ -667,6 +672,9 @@ void mlx5e_build_default_indir_rqt(struct mlx5_core_dev 
*mdev,
   int num_channels);
 int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 
+void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
+u8 cq_period_mode);
+
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
  struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index f8bbc2b..4f433d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -524,10 +524,10 @@ static int mlx5e_get_coalesce(struct net_device *netdev,
if (!MLX5_CAP_GEN(priv->mdev, cq_moderation))
return -ENOTSUPP;
 
-   coal->rx_coalesce_usecs   = priv->params.rx_cq_moderation_usec;
-   coal->rx_max_coalesced_frames = priv->params.rx_cq_moderation_pkts;
-   coal->tx_coalesce_usecs   = priv->params.tx_cq_moderation_usec;
-   coal->tx_max_coalesced_frames = priv->params.tx_cq_moderation_pkts;
+   coal->rx_coalesce_usecs   = priv->params.rx_cq_moderation.usec;
+   coal->rx_max_coalesced_frames = priv->params.rx_cq_moderation.pkts;
+   coal->tx_coalesce_usecs   = priv->params.tx_cq_moderation.usec;
+   coal->tx_max_coalesced_frames = priv->params.tx_cq_moderation.pkts;
 
return 0;
 }
@@ -545,10 +545,11 @@ static int mlx5e_set_coalesce(struct net_device *netdev,
return -ENOTSUPP;
 
	mutex_lock(&priv->state_lock);
-   priv->params.tx_cq_moderation_usec = coal->tx_coalesce_usecs;
-   priv->params.tx_cq_moderation_pkts = coal->tx_max_coalesced_frames;
-   priv->params.rx_cq_moderation_usec = coal->rx_coalesce_usecs;
-   

[PATCH net-next V2 03/10] net/mlx5e: Introduce net device priv flags infrastructure

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Introduce an infrastructure for getting/setting private net device
flags.

Currently a 'nop' priv flag is added, following patches will override
the flag will actual feature specific flags.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 17 +++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 59 ++
 2 files changed, 76 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 017e047..02fa4da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -144,6 +144,22 @@ struct mlx5e_umr_wqe {
struct mlx5_wqe_data_seg   data;
 };
 
+static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
+   "nop",
+};
+
+enum mlx5e_priv_flag {
+   MLX5E_PFLAG_NOP = (1 << 0),
+};
+
+#define MLX5E_SET_PRIV_FLAG(priv, pflag, enable)\
+   do {\
+   if (enable) \
+   priv->pflags |= pflag;  \
+   else\
+   priv->pflags &= ~pflag; \
+   } while (0)
+
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 #define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */
 #define MLX5E_MIN_BW_ALLOC 1   /* Min percentage of BW allocation */
@@ -543,6 +559,7 @@ struct mlx5e_priv {
struct work_struct set_rx_mode_work;
struct delayed_workupdate_stats_work;
 
+   u32pflags;
struct mlx5_core_dev  *mdev;
struct net_device *netdev;
struct mlx5e_stats stats;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index fc7dcc0..f8bbc2b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -198,6 +198,8 @@ static int mlx5e_get_sset_count(struct net_device *dev, int 
sset)
   MLX5E_NUM_RQ_STATS(priv) +
   MLX5E_NUM_SQ_STATS(priv) +
   MLX5E_NUM_PFC_COUNTERS(priv);
+   case ETH_SS_PRIV_FLAGS:
+   return ARRAY_SIZE(mlx5e_priv_flags);
/* fallthrough */
default:
return -EOPNOTSUPP;
@@ -272,9 +274,12 @@ static void mlx5e_get_strings(struct net_device *dev,
  uint32_t stringset, uint8_t *data)
 {
struct mlx5e_priv *priv = netdev_priv(dev);
+   int i;
 
switch (stringset) {
case ETH_SS_PRIV_FLAGS:
+   for (i = 0; i < ARRAY_SIZE(mlx5e_priv_flags); i++)
+   strcpy(data + i * ETH_GSTRING_LEN, mlx5e_priv_flags[i]);
break;
 
case ETH_SS_TEST:
@@ -1272,6 +1277,58 @@ static int mlx5e_get_module_eeprom(struct net_device 
*netdev,
return 0;
 }
 
+typedef int (*mlx5e_pflag_handler)(struct net_device *netdev, bool enable);
+
+static int set_pflag_nop(struct net_device *netdev, bool enable)
+{
+   return 0;
+}
+
+static int mlx5e_handle_pflag(struct net_device *netdev,
+ u32 wanted_flags,
+ enum mlx5e_priv_flag flag,
+ mlx5e_pflag_handler pflag_handler)
+{
+   struct mlx5e_priv *priv = netdev_priv(netdev);
+   bool enable = !!(wanted_flags & flag);
+   u32 changes = wanted_flags ^ priv->pflags;
+   int err;
+
+   if (!(changes & flag))
+   return 0;
+
+   err = pflag_handler(netdev, enable);
+   if (err) {
+   netdev_err(netdev, "%s private flag 0x%x failed err %d\n",
+  enable ? "Enable" : "Disable", flag, err);
+   return err;
+   }
+
+   MLX5E_SET_PRIV_FLAG(priv, flag, enable);
+   return 0;
+}
+
+static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags)
+{
+   struct mlx5e_priv *priv = netdev_priv(netdev);
+   int err;
+
+   mutex_lock(&priv->state_lock);
+
+   err = mlx5e_handle_pflag(netdev, pflags, MLX5E_PFLAG_NOP,
+set_pflag_nop);
+
+   mutex_unlock(&priv->state_lock);
+   return err ? -EINVAL : 0;
+}
+
+static u32 mlx5e_get_priv_flags(struct net_device *netdev)
+{
+   struct mlx5e_priv *priv = netdev_priv(netdev);
+
+   return priv->pflags;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
.get_drvinfo   = mlx5e_get_drvinfo,
.get_link  = ethtool_op_get_link,
@@ -1301,4 +1358,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
.set_wol   = mlx5e_set_wol,
.get_module_info   = mlx5e_get_module_info,
.get_module_eeprom = mlx5e_get_module_eeprom,
+   .get_priv_flags= mlx5e_get_priv_flags,
+   

[PATCH net-next V2 07/10] ethtool: Add 50G baseSR2 link mode

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Add ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT bit.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
Cc: Ben Hutchings 
Cc: David Decotigny 
---
 include/uapi/linux/ethtool.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 5f030b4..b8f38e8 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1362,6 +1362,7 @@ enum ethtool_link_mode_bit_indices {
	ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT= 37,
	ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT= 38,
	ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT= 39,
+   ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40,
 
/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
@@ -1370,7 +1371,7 @@ enum ethtool_link_mode_bit_indices {
 */
 
__ETHTOOL_LINK_MODE_LAST
- = ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT,
+ = ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT,
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \
-- 
2.8.0



[PATCH net-next V2 08/10] net/mlx5e: Add missing 50G baseSR2 link mode

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Add MLX5E_50GBASE_SR2 as ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
Cc: Ben Hutchings 
Cc: David Decotigny 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index aa36a3a..b8732e6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -616,6 +616,7 @@ enum mlx5e_link_mode {
MLX5E_10GBASE_ER = 14,
MLX5E_40GBASE_SR4= 15,
MLX5E_40GBASE_LR4= 16,
+   MLX5E_50GBASE_SR2= 18,
MLX5E_100GBASE_CR4   = 20,
MLX5E_100GBASE_SR4   = 21,
MLX5E_100GBASE_KR4   = 22,
-- 
2.8.0



[PATCH net-next V2 06/10] net/mlx5e: Toggle link only after modifying port parameters

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Add a dedicated function to toggle port link. It should be called only
after setting a port register.
Toggle will set port link to down and bring it back up in case that it's
admin status was up.

Signed-off-by: Gal Pressman 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c   |  9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c |  7 +--
 drivers/net/ethernet/mellanox/mlx5/core/port.c   | 12 
 include/linux/mlx5/port.h|  1 +
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index b2db180..e688313 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -191,7 +191,6 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
 {
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5_core_dev *mdev = priv->mdev;
-   enum mlx5_port_status ps;
u8 curr_pfc_en;
int ret;
 
@@ -200,14 +199,8 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
if (pfc->pfc_en == curr_pfc_en)
return 0;
 
-   mlx5_query_port_admin_status(mdev, &ps);
-   if (ps == MLX5_PORT_UP)
-   mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
-
ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en);
-
-   if (ps == MLX5_PORT_UP)
-   mlx5_set_port_admin_status(mdev, MLX5_PORT_UP);
+   mlx5_toggle_port_link(mdev);
 
return ret;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index c4be394..d0d3dcf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -795,7 +795,6 @@ static int mlx5e_set_settings(struct net_device *netdev,
u32 link_modes;
u32 speed;
u32 eth_proto_cap, eth_proto_admin;
-   enum mlx5_port_status ps;
int err;
 
speed = ethtool_cmd_speed(cmd);
@@ -829,12 +828,8 @@ static int mlx5e_set_settings(struct net_device *netdev,
if (link_modes == eth_proto_admin)
goto out;
 
-   mlx5_query_port_admin_status(mdev, &ps);
-   if (ps == MLX5_PORT_UP)
-   mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
-   if (ps == MLX5_PORT_UP)
-   mlx5_set_port_admin_status(mdev, MLX5_PORT_UP);
+   mlx5_toggle_port_link(mdev);
 
 out:
return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c 
b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 3e35611..1562e73 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -222,6 +222,18 @@ int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 
proto_admin,
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_proto);
 
+/* This function should be used after setting a port register only */
+void mlx5_toggle_port_link(struct mlx5_core_dev *dev)
+{
+   enum mlx5_port_status ps;
+
+   mlx5_query_port_admin_status(dev, &ps);
+   mlx5_set_port_admin_status(dev, MLX5_PORT_DOWN);
+   if (ps == MLX5_PORT_UP)
+   mlx5_set_port_admin_status(dev, MLX5_PORT_UP);
+}
+EXPORT_SYMBOL_GPL(mlx5_toggle_port_link);
+
 int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
   enum mlx5_port_status status)
 {
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 9851862..4adfac1 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -67,6 +67,7 @@ int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
   u8 local_port);
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
int proto_mask);
+void mlx5_toggle_port_link(struct mlx5_core_dev *dev);
 int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
   enum mlx5_port_status status);
 int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
-- 
2.8.0



[PATCH net-next V2 02/10] net/mlx5e: Add TXQ set max rate support

2016-06-23 Thread Saeed Mahameed
From: Yevgeny Petrilin 

Implement set_maxrate ndo.
Use the rate index from the hardware table to attach to channel SQ/TXQ.
In case of failure to configure new rate, the queue remains with
unlimited rate.

We save the configuration on priv structure and apply it each time
Send Queues are being reinitialized (after open/close) operations.

Signed-off-by: Yevgeny Petrilin 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 102 +-
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e8a6c33..017e047 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -88,6 +88,7 @@
 #define MLX5E_LOG_INDIR_RQT_SIZE   0x7
 #define MLX5E_INDIR_RQT_SIZE   BIT(MLX5E_LOG_INDIR_RQT_SIZE)
 #define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1)
+#define MLX5E_MAX_NUM_SQS  (MLX5E_MAX_NUM_CHANNELS * 
MLX5E_MAX_NUM_TC)
 #define MLX5E_TX_CQ_POLL_BUDGET128
 #define MLX5E_UPDATE_STATS_INTERVAL200 /* msecs */
 #define MLX5E_SQ_BF_BUDGET 16
@@ -354,6 +355,7 @@ struct mlx5e_sq {
struct mlx5e_channel  *channel;
inttc;
struct mlx5e_ico_wqe_info *ico_wqe_info;
+   u32rate_limit;
 } cacheline_aligned_in_smp;
 
 static inline bool mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
@@ -530,6 +532,7 @@ struct mlx5e_priv {
u32indir_rqtn;
u32indir_tirn[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_direct_tirdirect_tir[MLX5E_MAX_NUM_CHANNELS];
+   u32tx_rates[MLX5E_MAX_NUM_SQS];
 
struct mlx5e_flow_steering fs;
struct mlx5e_vxlan_db  vxlan;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8b7c6f3..e5a2cef 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -702,7 +702,8 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct 
mlx5e_sq_param *param)
return err;
 }
 
-static int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
+static int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state,
+  int next_state, bool update_rl, int rl_index)
 {
struct mlx5e_channel *c = sq->channel;
struct mlx5e_priv *priv = c->priv;
@@ -722,6 +723,10 @@ static int mlx5e_modify_sq(struct mlx5e_sq *sq, int 
curr_state, int next_state)
 
MLX5_SET(modify_sq_in, in, sq_state, curr_state);
MLX5_SET(sqc, sqc, state, next_state);
+   if (update_rl && next_state == MLX5_SQC_STATE_RDY) {
+   MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
+   MLX5_SET(sqc,  sqc, packet_pacing_rate_limit_index, rl_index);
+   }
 
err = mlx5_core_modify_sq(mdev, sq->sqn, in, inlen);
 
@@ -737,6 +742,8 @@ static void mlx5e_disable_sq(struct mlx5e_sq *sq)
struct mlx5_core_dev *mdev = priv->mdev;
 
mlx5_core_destroy_sq(mdev, sq->sqn);
+   if (sq->rate_limit)
+   mlx5_rl_remove_rate(mdev, sq->rate_limit);
 }
 
 static int mlx5e_open_sq(struct mlx5e_channel *c,
@@ -754,7 +761,8 @@ static int mlx5e_open_sq(struct mlx5e_channel *c,
if (err)
goto err_destroy_sq;
 
-   err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+   err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY,
+ false, 0);
if (err)
goto err_disable_sq;
 
@@ -793,7 +801,8 @@ static void mlx5e_close_sq(struct mlx5e_sq *sq)
if (mlx5e_sq_has_room_for(sq, 1))
mlx5e_send_nop(sq, true);
 
-   mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+   mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR,
+   false, 0);
}
 
while (sq->cc != sq->pc) /* wait till sq is empty */
@@ -1024,6 +1033,79 @@ static void mlx5e_build_channeltc_to_txq_map(struct 
mlx5e_priv *priv, int ix)
ix + i * priv->params.num_channels;
 }
 
+static int mlx5e_set_sq_maxrate(struct net_device *dev,
+   struct mlx5e_sq *sq, u32 rate)
+{
+   struct mlx5e_priv *priv = netdev_priv(dev);
+   struct mlx5_core_dev *mdev = priv->mdev;
+   u16 rl_index = 0;
+   int err;
+
+   if (rate == sq->rate_limit)
+   /* nothing to do */
+   return 0;
+
+   if (sq->rate_limit)
+   /* remove current rl index to free space to next ones */
+   

[PATCH net-next V2 09/10] net/mlx5e: Use new ethtool get/set link ksettings API

2016-06-23 Thread Saeed Mahameed
From: Gal Pressman 

Use new get/set link ksettings and remove get/set settings legacy
callbacks.
This allows us to use bitmasks longer than 32 bit for supported and
advertised link modes and use modes that were previously not supported.

Signed-off-by: Gal Pressman 
CC: Ben Hutchings 
CC: David Decotigny 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   3 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 306 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   1 +
 3 files changed, 143 insertions(+), 167 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index b8732e6..da885c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -634,6 +634,9 @@ enum mlx5e_link_mode {
 
 #define MLX5E_PROT_MASK(link_mode) (1 << link_mode)
 
+
+void mlx5e_build_ptys2ethtool_map(void);
+
 void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw);
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
   void *accel_priv, select_queue_fallback_t fallback);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index d0d3dcf..4c560e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -48,123 +48,85 @@ static void mlx5e_get_drvinfo(struct net_device *dev,
sizeof(drvinfo->bus_info));
 }
 
-static const struct {
-   u32 supported;
-   u32 advertised;
+struct ptys2ethtool_config {
+   __ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+   __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
u32 speed;
-} ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER] = {
-   [MLX5E_1000BASE_CX_SGMII] = {
-   .supported  = SUPPORTED_1000baseKX_Full,
-   .advertised = ADVERTISED_1000baseKX_Full,
-   .speed  = 1000,
-   },
-   [MLX5E_1000BASE_KX] = {
-   .supported  = SUPPORTED_1000baseKX_Full,
-   .advertised = ADVERTISED_1000baseKX_Full,
-   .speed  = 1000,
-   },
-   [MLX5E_10GBASE_CX4] = {
-   .supported  = SUPPORTED_1baseKX4_Full,
-   .advertised = ADVERTISED_1baseKX4_Full,
-   .speed  = 1,
-   },
-   [MLX5E_10GBASE_KX4] = {
-   .supported  = SUPPORTED_1baseKX4_Full,
-   .advertised = ADVERTISED_1baseKX4_Full,
-   .speed  = 1,
-   },
-   [MLX5E_10GBASE_KR] = {
-   .supported  = SUPPORTED_1baseKR_Full,
-   .advertised = ADVERTISED_1baseKR_Full,
-   .speed  = 1,
-   },
-   [MLX5E_20GBASE_KR2] = {
-   .supported  = SUPPORTED_2baseKR2_Full,
-   .advertised = ADVERTISED_2baseKR2_Full,
-   .speed  = 2,
-   },
-   [MLX5E_40GBASE_CR4] = {
-   .supported  = SUPPORTED_4baseCR4_Full,
-   .advertised = ADVERTISED_4baseCR4_Full,
-   .speed  = 4,
-   },
-   [MLX5E_40GBASE_KR4] = {
-   .supported  = SUPPORTED_4baseKR4_Full,
-   .advertised = ADVERTISED_4baseKR4_Full,
-   .speed  = 4,
-   },
-   [MLX5E_56GBASE_R4] = {
-   .supported  = SUPPORTED_56000baseKR4_Full,
-   .advertised = ADVERTISED_56000baseKR4_Full,
-   .speed  = 56000,
-   },
-   [MLX5E_10GBASE_CR] = {
-   .supported  = SUPPORTED_1baseKR_Full,
-   .advertised = ADVERTISED_1baseKR_Full,
-   .speed  = 1,
-   },
-   [MLX5E_10GBASE_SR] = {
-   .supported  = SUPPORTED_1baseKR_Full,
-   .advertised = ADVERTISED_1baseKR_Full,
-   .speed  = 1,
-   },
-   [MLX5E_10GBASE_ER] = {
-   .supported  = SUPPORTED_1baseKR_Full,
-   .advertised = ADVERTISED_1baseKR_Full,
-   .speed  = 1,
-   },
-   [MLX5E_40GBASE_SR4] = {
-   .supported  = SUPPORTED_4baseSR4_Full,
-   .advertised = ADVERTISED_4baseSR4_Full,
-   .speed  = 4,
-   },
-   [MLX5E_40GBASE_LR4] = {
-   .supported  = SUPPORTED_4baseLR4_Full,
-   .advertised = ADVERTISED_4baseLR4_Full,
-   .speed  = 4,
-   },
-   [MLX5E_100GBASE_CR4] = {
-   .speed  = 10,
-   },
-   [MLX5E_100GBASE_SR4] = {
-   .speed  = 10,
-   },
-   [MLX5E_100GBASE_KR4] = {
-   .speed  = 10,
-   },
-   [MLX5E_100GBASE_LR4] = 

[PATCH net-next V2 01/10] net/mlx5: Rate limit tables support

2016-06-23 Thread Saeed Mahameed
From: Yevgeny Petrilin 

Configuring and managing HW rate limit tables.
The HW holds a table of rate limits, each rate is
associated with an index in that table.
Later a Send Queue uses this index to set the rate limit.
Multiple Send Queues can have the same rate limit, which is
represented by a single entry in this table.
Even though a rate can be shared, each queue is being rate
limited independently of others.

The SW shadow of this table holds the rate itself,
the index in the HW table and the refcount (number of queues)
working with this rate.

The exported functions are mlx5_rl_add_rate and mlx5_rl_remove_rate.
Number of different rates and their values are derived
from HW capabilities.

Signed-off-by: Yevgeny Petrilin 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |   5 +-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c   |  10 ++
 drivers/net/ethernet/mellanox/mlx5/core/rl.c | 209 +++
 include/linux/mlx5/device.h  |   4 +
 include/linux/mlx5/driver.h  |  27 +++
 6 files changed, 259 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/rl.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9ea7b58..0c8a7dc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -1,8 +1,9 @@
 obj-$(CONFIG_MLX5_CORE)+= mlx5_core.o
 
 mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
-   health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
-   mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o 
fs_counters.o
+   health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \
+   mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
+   fs_counters.o rl.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \
en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 75c7ae6..77fc1aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -151,6 +151,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
return err;
}
 
+   if (MLX5_CAP_GEN(dev, qos)) {
+   err = mlx5_core_get_caps(dev, MLX5_CAP_QOS);
+   if (err)
+   return err;
+   }
+
return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a19b593..08cae34 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1144,6 +1144,13 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, 
struct mlx5_priv *priv)
dev_err(>dev, "Failed to init flow steering\n");
goto err_fs;
}
+
+   err = mlx5_init_rl_table(dev);
+   if (err) {
+   dev_err(>dev, "Failed to init rate limiting\n");
+   goto err_rl;
+   }
+
 #ifdef CONFIG_MLX5_CORE_EN
err = mlx5_eswitch_init(dev);
if (err) {
@@ -1183,6 +1190,8 @@ err_sriov:
mlx5_eswitch_cleanup(dev->priv.eswitch);
 #endif
 err_reg_dev:
+   mlx5_cleanup_rl_table(dev);
+err_rl:
mlx5_cleanup_fs(dev);
 err_fs:
mlx5_cleanup_mkey_table(dev);
@@ -1253,6 +1262,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, 
struct mlx5_priv *priv)
mlx5_eswitch_cleanup(dev->priv.eswitch);
 #endif
 
+   mlx5_cleanup_rl_table(dev);
mlx5_cleanup_fs(dev);
mlx5_cleanup_mkey_table(dev);
mlx5_cleanup_srq_table(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c 
b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
new file mode 100644
index 000..c07c28b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *

vmw_vsock sk_ack_backlog double decrement bug

2016-06-23 Thread Stefan Hajnoczi
Hi Jorgen,
virtio-vsock doesn't use vsock_pending_work() but I may have spotted a
problem that affects the VMCI transport.  I'm not sending a patch
because I can't test it.

1. During vsock_accept() listener->sk_ack_backlog is decremented.
2. vsock_pending_work() will decrement listener->sk_ack_backlog again if
   vsk->rejected.

The result is that sk_ack_backlog can be invalid.  It only happens in
the case where the listener socket has an error.  Maybe in practice it's
not a problem because the server application will close the listener
socket if there is an error...

Stefan


signature.asc
Description: PGP signature


Re: [patch net-next v5 0/4] return offloaded stats as default and expose original sw stats

2016-06-23 Thread Jiri Pirko
Thu, Jun 23, 2016 at 05:11:26PM CEST, anurad...@cumulusnetworks.com wrote:
> we can't separate CPU and HW stats there. In some cases (or ASICs) HW
> counters do
> not include CPU generated packets... you will have to add CPU
> generated pkt counters to the
> hw counters for such virtual device stats.
 Can you please provide an example of how that could happen?
>>>
>>>example is the bridge vlan stats I mention below. These are usually counted
>>>by attaching hw virtual counter resources. And CPU generated packets
>>>in some cases maybe setup to bypass the ASIC pipeline because the CPU
>>>has already made the required decisions. So, they may not be counted by
>>>by such hw virtual counters.
>>
>> Bypass ASIC? How do the packets get on the wire?
>>
>
>Bypass the "forwarding pipeline" in the ASIC that is. Obviously the
>ASIC ships the CPU generated packet out of the switch/front-panel
>port. Continuing Roopa's example of vlan netdev stats... To get the
>HW stats, counters are typically tied to the ingress and egress vlan hw
>entries. All the incoming packets are subject to the ingress vlan
>lookup irrespective of whether they get punted to the CPU or whether
>they are forwarded to another front panel port. In that case the
>ingress HW stats do represent all packets. However, for CPU
>originated packets egress vlan lookups are bypassed in the ASIC (this
>is common forwarding option in most ASICs) and the packet shipped as
>is out of front-panel port specified by the CPU. Which means these
>packets will NOT be counted against the egress VLAN HW counter; hence
>the need for summation.

Driver will know about this, and will provide the stats accordignly to
the core. Who else than driver should resolve this.



[PATCH] vsock: make listener child lock ordering explicit

2016-06-23 Thread Stefan Hajnoczi
There are several places where the listener and pending or accept queue
child sockets are accessed at the same time.  Lockdep is unhappy that
two locks from the same class are held.

Tell lockdep that it is safe and document the lock ordering.

Originally Claudio Imbrenda  sent a similar
patch asking whether this is safe.  I have audited the code and also
covered the vsock_pending_work() function.

Suggested-by: Claudio Imbrenda 
Signed-off-by: Stefan Hajnoczi 
---
 net/vmw_vsock/af_vsock.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b5f1221..b96ac91 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -61,6 +61,14 @@
  * function will also cleanup rejected sockets, those that reach the connected
  * state but leave it before they have been accepted.
  *
+ * - Lock ordering for pending or accept queue sockets is:
+ *
+ * lock_sock(listener);
+ * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
+ *
+ * Using explicit nested locking keeps lockdep happy since normally only one
+ * lock of a given class may be taken at a time.
+ *
  * - Sockets created by user action will be cleaned up when the user process
  * calls close(2), causing our release implementation to be called. Our release
  * implementation will perform some cleanup then drop the last reference so our
@@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work)
cleanup = true;
 
lock_sock(listener);
-   lock_sock(sk);
+   lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
@@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct 
socket *newsock, int flags)
if (connected) {
listener->sk_ack_backlog--;
 
-   lock_sock(connected);
+   lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
 
/* If the listener socket has received an error, then we should
-- 
2.7.4



Re: [PATCH] dsa: mv88e6xxx: hide unused functions

2016-06-23 Thread Vivien Didelot
Hi,

Arnd Bergmann  writes:

> When CONFIG_NET_DSA_HWMON is disabled, we get warnings about two unused
> functions whose only callers are all inside of an #ifdef:
>
> drivers/net/dsa/mv88e6xxx.c:3257:12: 'mv88e6xxx_mdio_page_write' defined but 
> not used [-Werror=unused-function]
> drivers/net/dsa/mv88e6xxx.c:3244:12: 'mv88e6xxx_mdio_page_read' defined but 
> not used [-Werror=unused-function]
>
> This adds another ifdef around the function definitions. The warnings
> appeared after the functions were marked 'static', but the problem
> was already there before that.
>
> Signed-off-by: Arnd Bergmann 
> Fixes: 57d3231057e9 ("net: dsa: mv88e6xxx: fix style issues")

Reviewed-by: Vivien Didelot 

David, this patch is meant for net-next. It applies cleanly *before* my
last two submissions:

1/2 http://patchwork.ozlabs.org/patch/638773/
2/2 http://patchwork.ozlabs.org/patch/638772/

Thanks,

Vivien


Re: [patch net-next v5 0/4] return offloaded stats as default and expose original sw stats

2016-06-23 Thread Anuradha Karuppiah
 we can't separate CPU and HW stats there. In some cases (or ASICs) HW
 counters do
 not include CPU generated packets... you will have to add CPU
 generated pkt counters to the
 hw counters for such virtual device stats.
>>> Can you please provide an example of how that could happen?
>>
>>example is the bridge vlan stats I mention below. These are usually counted
>>by attaching hw virtual counter resources. And CPU generated packets
>>in some cases maybe setup to bypass the ASIC pipeline because the CPU
>>has already made the required decisions. So, they may not be counted by
>>by such hw virtual counters.
>
> Bypass ASIC? How do the packets get on the wire?
>

Bypass the "forwarding pipeline" in the ASIC that is. Obviously the
ASIC ships the CPU generated packet out of the switch/front-panel
port. Continuing Roopa's example of vlan netdev stats... To get the
HW stats, counters are typically tied to the ingress and egress vlan hw
entries. All the incoming packets are subject to the ingress vlan
lookup irrespective of whether they get punted to the CPU or whether
they are forwarded to another front panel port. In that case the
ingress HW stats do represent all packets. However, for CPU
originated packets egress vlan lookups are bypassed in the ASIC (this
is common forwarding option in most ASICs) and the packet shipped as
is out of front-panel port specified by the CPU. Which means these
packets will NOT be counted against the egress VLAN HW counter; hence
the need for summation.


Re: [PATCH] Maxim/driver: Add driver for maxim ds26522

2016-06-23 Thread David Miller
From: Zhao Qiang 
Date: Thu, 23 Jun 2016 09:09:45 +0800

> +MODULE_DESCRIPTION(DRV_DESC);

There is no definition of DRV_DESC, so this makes it look like
you didn't even compile this driver.


Re: [PATCH net v2] ipv6: enforce egress device match in per table nexthop lookups

2016-06-23 Thread Paolo Abeni
On Thu, 2016-06-23 at 08:29 -0600, David Ahern wrote:
> On 6/23/16 8:20 AM, David Ahern wrote:
> >> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >> index 969913d..520b788 100644
> >> --- a/net/ipv6/route.c
> >> +++ b/net/ipv6/route.c
> >> @@ -1782,7 +1782,7 @@ static struct rt6_info
> >> *ip6_nh_lookup_table(struct net *net,
> >>  };
> >>  struct fib6_table *table;
> >>  struct rt6_info *rt;
> >> -int flags = 0;
> >> +int flags = RT6_LOOKUP_F_IFACE;
> >>
> >>  table = fib6_get_table(net, cfg->fc_table);
> >>  if (!table)
> >>
> >
> > Acked-by: David Ahern 
> 
> I take that back.
> 
> I think RT6_LOOKUP_F_IFACE should only be set if cfg->fc_ifindex is set.

AFAICS the latter condition should not be needed. The related
information is passed all way down to rt6_score_route(), where it's
really used:

m = rt6_check_dev(rt, oif);
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;

and 'm' can be 0 only if oif is set: RT6_LOOKUP_F_IFACE has no effect
unless ifindex is set.

Paolo




Re: [PATCH net v2] ipv6: enforce egress device match in per table nexthop lookups

2016-06-23 Thread David Ahern

On 6/23/16 8:20 AM, David Ahern wrote:

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913d..520b788 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info
*ip6_nh_lookup_table(struct net *net,
 };
 struct fib6_table *table;
 struct rt6_info *rt;
-int flags = 0;
+int flags = RT6_LOOKUP_F_IFACE;

 table = fib6_get_table(net, cfg->fc_table);
 if (!table)



Acked-by: David Ahern 


I take that back.

I think RT6_LOOKUP_F_IFACE should only be set if cfg->fc_ifindex is set.


Re: [PATCH net-next 0/4] net_sched: bulk dequeue and deferred drops

2016-06-23 Thread Jesper Dangaard Brouer
On Wed, 22 Jun 2016 09:49:48 -0700
Eric Dumazet  wrote:

> On Wed, 2016-06-22 at 17:44 +0200, Jesper Dangaard Brouer wrote:
> > On Wed, 22 Jun 2016 07:55:43 -0700
> > Eric Dumazet  wrote:
> >   
> > > On Wed, 2016-06-22 at 16:47 +0200, Jesper Dangaard Brouer wrote:  
> > > > On Tue, 21 Jun 2016 23:16:48 -0700
> > > > Eric Dumazet  wrote:
> > > > 
> > > > > First patch adds an additional parameter to ->enqueue() qdisc method
> > > > > so that drops can be done outside of critical section
> > > > > (after locks are released).
> > > > > 
> > > > > Then fq_codel can have a small optimization to reduce number of cache
> > > > > lines misses during a drop event
> > > > > (possibly accumulating hundreds of packets to be freed).
> > > > > 
> > > > > A small htb change exports the backlog in class dumps.
> > > > > 
> > > > > Final patch adds bulk dequeue to qdiscs that were lacking this 
> > > > > feature.
> > > > > 
> > > > > This series brings a nice qdisc performance increase (more than 80 %
> > > > > in some cases).
> > > > 
> > > > Thanks for working on this Eric! this is great work! :-)
> > > 
> > > Thanks Jesper
> > > 
> > > I worked yesterday on bulk enqueues, but initial results are not that
> > > great.  
> > 
> > Hi Eric,
> > 
> > This is interesting work! But I think you should read Luigi Rizzo's
> > (Cc'ed) paper on title "A Fast and Practical Software Packet Scheduling
> > Architecture"[1]
> > 
> > [1] http://info.iet.unipi.it/~luigi/papers/20160511-mysched-preprint.pdf
> > 
> > Luigi will be at Netfilter Workshop next week, and will actually
> > present on topic/paper you two should talk ;-)
> > 
> > The article is not a 100% match for what we need, but there is some
> > good ideas.  The article also have a sort of "prequeue" that
> > enqueue'ing CPUs will place packets into.
> > 
> > My understanding of the article:
> > 
> > 1. transmitters submit packets to an intermediate queue
> >(replace q->enqueue call) lockless submit as queue per CPU
> >(runs in parallel)
> > 
> > 2. like we only have _one_ qdisc dequeue process, this process (called
> >arbiter) empty the intermediate queues, and then invoke q->enqueue()
> >and q->dequeue(). (in a locked session/region)
> > 
> > 3. Packets returned from q->dequeue() is placed on an outgoing
> >intermediate queue.
> > 
> > 4. the transmitter then looks to see there are any packets to drain()
> >from the outgoing queue.  This can run in parallel.
> > 
> > If the transmitter submitting a packet, detect no arbiter is running,
> > it can become the arbiter itself.  Like we do with qdisc_run_begin()
> > setting state __QDISC___STATE_RUNNING.
> > 
> > The problem with this scheme is push-back from qdisc->enqueue
> > (NET_XMIT_CN) does not "reach" us.  And push-back in-form of processes
> > blocking on qdisc root lock, but that could be handled by either
> > blocking in article's submit() or returning some congestion return code
> > from submit().   
> 
> Okay, I see that you prepare upcoming conference in Amsterdam,
> but please keep this thread about existing kernel code, not the one that
> eventually reach a new operating system in 5 years ;)
> 
> 1) We _want_ the result of the sends, obviously.

How dependent are we on the return codes?

E.g. the NET_XMIT_CN return is not that accurate, it does not mean this
packet was dropped, it could be from an unrelated flow.


> 2) We also want back pressure, without adding complex callbacks and
> ref-counting.
> 
> 3) We do not want to burn a cpu per TX queue (at least one per NUMA
> node ???) only to send few packets per second,
> Our model is still interrupt based, plus NAPI for interrupt mitigation.
>
> 4) I do not want to lock an innocent cpu to send packets from other
> threads/cpu without a tight control.

Article present two modes: 1) a dedicated CPU runs the "arbiter",
2) submitting CPU becomes the arbiter (iif not other CPU is the arbiter).

I imagine we use mode 2.  Which is almost what we already do now.
The qdisc layer only allow a single CPU to be dequeue'ing packets.  This
process can be seen as the "arbiter".  The only difference is that it
will pickup packets from an intermediate queue, and invoke q->enqueue().
(Still keeping the quota in __qdisc_run()).

 
> In the patch I sent, I basically replaced a locked operation
> (spin_lock(>busylock)) with another one (xchg()) , but I did not add
> yet another queue before the qdisc ones, bufferbloat forbids.

Is it really bufferbloat to introduce an intermediate queue at this
point?  The enqueue/submit process can see that qdisc_is_running, thus
it knows these packets will be picked up very shortly (within 200
cycles) and "arbiter" will invoke q->enqueue() allowing qdisc to react
to bufferbloat.


> The virtual queue here is one packet per cpu, which basically is the
> same than before this patch, since each cpu spinning on busylock has one
> skb to send 

Re: [PATCH net v2] ipv6: enforce egress device match in per table nexthop lookups

2016-06-23 Thread David Ahern

On 6/23/16 7:25 AM, Paolo Abeni wrote:

with the commit 8c14586fc320 ("net: ipv6: Use passed in table for
nexthop lookups"), net hop lookup is first performed on route creation
in the passed-in table.
However device match is not enforced in table lookup, so the found
route can be later discarded due to egress device mismatch and no
global lookup will be performed.
This causes the following to fail:

ip link add dummy1 type dummy
ip link add dummy2 type dummy
ip link set dummy1 up
ip link set dummy2 up
ip route add 2001:db8:8086::/48 dev dummy1 metric 20
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy1 metric 20
ip route add 2001:db8:8086::/48 dev dummy2 metric 21
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy2 metric 21
RTNETLINK answers: No route to host

This change fixes the issue by enforcing device lookup in
ip6_nh_lookup_table()

v1->v2: updated commit message title

Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
Reported-and-tested-by: Beniamino Galvani 
Signed-off-by: Paolo Abeni 
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913d..520b788 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net 
*net,
};
struct fib6_table *table;
struct rt6_info *rt;
-   int flags = 0;
+   int flags = RT6_LOOKUP_F_IFACE;

table = fib6_get_table(net, cfg->fc_table);
if (!table)



Acked-by: David Ahern 


[PATCH] dsa: mv88e6xxx: hide unused functions

2016-06-23 Thread Arnd Bergmann
When CONFIG_NET_DSA_HWMON is disabled, we get warnings about two unused
functions whose only callers are all inside of an #ifdef:

drivers/net/dsa/mv88e6xxx.c:3257:12: 'mv88e6xxx_mdio_page_write' defined but 
not used [-Werror=unused-function]
drivers/net/dsa/mv88e6xxx.c:3244:12: 'mv88e6xxx_mdio_page_read' defined but not 
used [-Werror=unused-function]

This adds another ifdef around the function definitions. The warnings
appeared after the functions were marked 'static', but the problem
was already there before that.

Signed-off-by: Arnd Bergmann 
Fixes: 57d3231057e9 ("net: dsa: mv88e6xxx: fix style issues")
---
 drivers/net/dsa/mv88e6xxx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 9b116d8d4e23..2a95f2d6cf09 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -3241,6 +3241,7 @@ unlock:
return err;
 }
 
+#ifdef CONFIG_NET_DSA_HWMON
 static int mv88e6xxx_mdio_page_read(struct dsa_switch *ds, int port, int page,
int reg)
 {
@@ -3266,6 +3267,7 @@ static int mv88e6xxx_mdio_page_write(struct dsa_switch 
*ds, int port, int page,
 
return ret;
 }
+#endif
 
 static int mv88e6xxx_port_to_mdio_addr(struct mv88e6xxx_priv_state *ps,
   int port)
-- 
2.9.0



Re: rstpd implementation

2016-06-23 Thread Phil

On 06/22/2016 08:12 PM, Stephen Hemminger wrote:

On Wed, 22 Jun 2016 12:44:52 -0500
ebied...@xmission.com (Eric W. Biederman) wrote:


Phil  writes:


Hi,

When looking for an RSTP daemon I found Stephen Hemminger's
git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/rstp.git

with it's last commit from October 2011.

Is this implementation still in good use by anybody - or has it been
replaced/superseded by another implementation?

I don't know and when you get into user space daemons they aren't much
talked about on the kernel lists.  That said you will likely fair better
on the netdev list (cc'd).

Eric

The current one I recommend is the MSTPd done by Cumulus
  https://sourceforge.net/p/mstpd/wiki/Home/
But like all projects they could use help


Thanks a lot to Eric and Stephen for your answers.
In the meantime I also found https://github.com/ocedo/mstpd

Philipp


[PATCH net-next 3/5] phy: separate swphy state validation from register generation

2016-06-23 Thread Russell King
Separate out the generation of MII registers from the state validation.
This allows us to simplify the error handing in fixed_phy() by allowing
earlier error detection.

Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/phy/fixed_phy.c | 15 +++
 drivers/net/phy/swphy.c | 33 ++---
 drivers/net/phy/swphy.h |  3 ++-
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index d98a0d90b5a5..d84e30c46824 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -48,12 +48,12 @@ static struct fixed_mdio_bus platform_fmb = {
.phys = LIST_HEAD_INIT(platform_fmb.phys),
 };
 
-static int fixed_phy_update_regs(struct fixed_phy *fp)
+static void fixed_phy_update_regs(struct fixed_phy *fp)
 {
if (gpio_is_valid(fp->link_gpio))
fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio);
 
-   return swphy_update_regs(fp->regs, >status);
+   swphy_update_regs(fp->regs, >status);
 }
 
 static int fixed_mdio_read(struct mii_bus *bus, int phy_addr, int reg_num)
@@ -160,6 +160,10 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
struct fixed_mdio_bus *fmb = _fmb;
struct fixed_phy *fp;
 
+   ret = swphy_validate_state(status);
+   if (ret < 0)
+   return ret;
+
fp = kzalloc(sizeof(*fp), GFP_KERNEL);
if (!fp)
return -ENOMEM;
@@ -180,17 +184,12 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
goto err_regs;
}
 
-   ret = fixed_phy_update_regs(fp);
-   if (ret)
-   goto err_gpio;
+   fixed_phy_update_regs(fp);
 
list_add_tail(>node, >phys);
 
return 0;
 
-err_gpio:
-   if (gpio_is_valid(fp->link_gpio))
-   gpio_free(fp->link_gpio);
 err_regs:
kfree(fp);
return ret;
diff --git a/drivers/net/phy/swphy.c b/drivers/net/phy/swphy.c
index c88a194b4cb6..21a9bd8a7830 100644
--- a/drivers/net/phy/swphy.c
+++ b/drivers/net/phy/swphy.c
@@ -87,6 +87,29 @@ static int swphy_decode_speed(int speed)
 }
 
 /**
+ * swphy_validate_state - validate the software phy status
+ * @state: software phy status
+ *
+ * This checks that we can represent the state stored in @state can be
+ * represented in the emulated MII registers.  Returns 0 if it can,
+ * otherwise returns -EINVAL.
+ */
+int swphy_validate_state(const struct fixed_phy_status *state)
+{
+   int err;
+
+   if (state->link) {
+   err = swphy_decode_speed(state->speed);
+   if (err < 0) {
+   pr_warn("swphy: unknown speed\n");
+   return -EINVAL;
+   }
+   }
+   return 0;
+}
+EXPORT_SYMBOL_GPL(swphy_validate_state);
+
+/**
  * swphy_update_regs - update MII register array with fixed phy state
  * @regs: array of 32 registers to update
  * @state: fixed phy status
@@ -94,7 +117,7 @@ static int swphy_decode_speed(int speed)
  * Update the array of MII registers with the fixed phy link, speed,
  * duplex and pause mode settings.
  */
-int swphy_update_regs(u16 *regs, const struct fixed_phy_status *state)
+void swphy_update_regs(u16 *regs, const struct fixed_phy_status *state)
 {
int speed_index, duplex_index;
u16 bmsr = BMSR_ANEGCAPABLE;
@@ -103,10 +126,8 @@ int swphy_update_regs(u16 *regs, const struct 
fixed_phy_status *state)
u16 lpa = 0;
 
speed_index = swphy_decode_speed(state->speed);
-   if (speed_index < 0) {
-   pr_warn("swphy: unknown speed\n");
-   return -EINVAL;
-   }
+   if (WARN_ON(speed_index < 0))
+   return;
 
duplex_index = state->duplex ? SWMII_DUPLEX_FULL : SWMII_DUPLEX_HALF;
 
@@ -133,7 +154,5 @@ int swphy_update_regs(u16 *regs, const struct 
fixed_phy_status *state)
regs[MII_BMCR] = bmcr;
regs[MII_LPA] = lpa;
regs[MII_STAT1000] = lpagb;
-
-   return 0;
 }
 EXPORT_SYMBOL_GPL(swphy_update_regs);
diff --git a/drivers/net/phy/swphy.h b/drivers/net/phy/swphy.h
index feaa38ff86a2..33d2e061896e 100644
--- a/drivers/net/phy/swphy.h
+++ b/drivers/net/phy/swphy.h
@@ -3,6 +3,7 @@
 
 struct fixed_phy_status;
 
-int swphy_update_regs(u16 *regs, const struct fixed_phy_status *state);
+int swphy_validate_state(const struct fixed_phy_status *state);
+void swphy_update_regs(u16 *regs, const struct fixed_phy_status *state);
 
 #endif
-- 
2.1.0



[PATCH net-next 1/5] phy: move fixed_phy MII register generation to a library

2016-06-23 Thread Russell King
Move the fixed_phy MII register generation to a library to allow other
software phy implementations to use this code.

Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/phy/Kconfig |   4 ++
 drivers/net/phy/Makefile|   3 +-
 drivers/net/phy/fixed_phy.c |  95 ++---
 drivers/net/phy/swphy.c | 126 
 drivers/net/phy/swphy.h |   8 +++
 5 files changed, 143 insertions(+), 93 deletions(-)
 create mode 100644 drivers/net/phy/swphy.c
 create mode 100644 drivers/net/phy/swphy.h

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 8dac88abbc39..f96829415ce6 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -12,6 +12,9 @@ menuconfig PHYLIB
 
 if PHYLIB
 
+config SWPHY
+   bool
+
 comment "MII PHY device drivers"
 
 config AQUANTIA_PHY
@@ -159,6 +162,7 @@ config MICROCHIP_PHY
 config FIXED_PHY
tristate "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs"
depends on PHYLIB
+   select SWPHY
---help---
  Adds the platform "fixed" MDIO Bus to cover the boards that use
  PHYs that are not connected to the real MDIO bus.
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 4170642a2035..7158274327d0 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -1,6 +1,7 @@
 # Makefile for Linux PHY drivers
 
-libphy-objs:= phy.o phy_device.o mdio_bus.o mdio_device.o
+libphy-y   := phy.o phy_device.o mdio_bus.o mdio_device.o
+libphy-$(CONFIG_SWPHY) += swphy.o
 
 obj-$(CONFIG_PHYLIB)   += libphy.o
 obj-$(CONFIG_AQUANTIA_PHY) += aquantia.o
diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 2d2e4339f0df..d98a0d90b5a5 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 
+#include "swphy.h"
+
 #define MII_REGS_NUM 29
 
 struct fixed_mdio_bus {
@@ -48,101 +50,10 @@ static struct fixed_mdio_bus platform_fmb = {
 
 static int fixed_phy_update_regs(struct fixed_phy *fp)
 {
-   u16 bmsr = BMSR_ANEGCAPABLE;
-   u16 bmcr = 0;
-   u16 lpagb = 0;
-   u16 lpa = 0;
-
if (gpio_is_valid(fp->link_gpio))
fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio);
 
-   if (fp->status.duplex) {
-   switch (fp->status.speed) {
-   case 1000:
-   bmsr |= BMSR_ESTATEN;
-   break;
-   case 100:
-   bmsr |= BMSR_100FULL;
-   break;
-   case 10:
-   bmsr |= BMSR_10FULL;
-   break;
-   default:
-   break;
-   }
-   } else {
-   switch (fp->status.speed) {
-   case 1000:
-   bmsr |= BMSR_ESTATEN;
-   break;
-   case 100:
-   bmsr |= BMSR_100HALF;
-   break;
-   case 10:
-   bmsr |= BMSR_10HALF;
-   break;
-   default:
-   break;
-   }
-   }
-
-   if (fp->status.link) {
-   bmsr |= BMSR_LSTATUS | BMSR_ANEGCOMPLETE;
-
-   if (fp->status.duplex) {
-   bmcr |= BMCR_FULLDPLX;
-
-   switch (fp->status.speed) {
-   case 1000:
-   bmcr |= BMCR_SPEED1000;
-   lpagb |= LPA_1000FULL;
-   break;
-   case 100:
-   bmcr |= BMCR_SPEED100;
-   lpa |= LPA_100FULL;
-   break;
-   case 10:
-   lpa |= LPA_10FULL;
-   break;
-   default:
-   pr_warn("fixed phy: unknown speed\n");
-   return -EINVAL;
-   }
-   } else {
-   switch (fp->status.speed) {
-   case 1000:
-   bmcr |= BMCR_SPEED1000;
-   lpagb |= LPA_1000HALF;
-   break;
-   case 100:
-   bmcr |= BMCR_SPEED100;
-   lpa |= LPA_100HALF;
-   break;
-   case 10:
-   lpa |= LPA_10HALF;
-   break;
-   default:
-   pr_warn("fixed phy: unknown speed\n");
-   return -EINVAL;
-   }
-   

[PATCH net-next 0/5] Initial SFP support patches

2016-06-23 Thread Russell King - ARM Linux
Hi David,

Please review and merge this initial patch set, which is part of a
larger set previously posted adding SFP support to phy and mvneta.

This initial set are focused on cleaning up and reorganising the
fixed-phy code to allow the core software-phy code to be re-used.

These are based on net-next.

Thanks.

 drivers/net/phy/Kconfig |   4 +
 drivers/net/phy/Makefile|   3 +-
 drivers/net/phy/fixed_phy.c | 153 +++--
 drivers/net/phy/swphy.c | 179 
 drivers/net/phy/swphy.h |   9 +++
 5 files changed, 222 insertions(+), 126 deletions(-)

-- 
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.


[PATCH net-next 2/5] phy: convert swphy register generation to tabular form

2016-06-23 Thread Russell King
Convert the swphy register generation to tabular form which allows us
to eliminate multiple switch() statements.  This results in a smaller
object code size, more efficient, and easier to add support for faster
speeds.

Before:

Idx Name  Size  VMA   LMA   File off  Algn
  0 .text 0164      0034  2**2

   textdata bss dec hex filename
388   0   0 388 184 swphy.o

After:

Idx Name  Size  VMA   LMA   File off  Algn
  0 .text 00fc      0034  2**2
  5 .rodata   0028      0138  2**2

   textdata bss dec hex filename
324   0   0 324 144 swphy.o

Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/phy/swphy.c | 143 ++--
 1 file changed, 78 insertions(+), 65 deletions(-)

diff --git a/drivers/net/phy/swphy.c b/drivers/net/phy/swphy.c
index 0551a79a2454..c88a194b4cb6 100644
--- a/drivers/net/phy/swphy.c
+++ b/drivers/net/phy/swphy.c
@@ -20,6 +20,72 @@
 
 #include "swphy.h"
 
+struct swmii_regs {
+   u16 bmcr;
+   u16 bmsr;
+   u16 lpa;
+   u16 lpagb;
+};
+
+enum {
+   SWMII_SPEED_10 = 0,
+   SWMII_SPEED_100,
+   SWMII_SPEED_1000,
+   SWMII_DUPLEX_HALF = 0,
+   SWMII_DUPLEX_FULL,
+};
+
+/*
+ * These two tables get bitwise-anded together to produce the final result.
+ * This means the speed table must contain both duplex settings, and the
+ * duplex table must contain all speed settings.
+ */
+static const struct swmii_regs speed[] = {
+   [SWMII_SPEED_10] = {
+   .bmcr  = BMCR_FULLDPLX,
+   .lpa   = LPA_10FULL | LPA_10HALF,
+   },
+   [SWMII_SPEED_100] = {
+   .bmcr  = BMCR_FULLDPLX | BMCR_SPEED100,
+   .bmsr  = BMSR_100FULL | BMSR_100HALF,
+   .lpa   = LPA_100FULL | LPA_100HALF,
+   },
+   [SWMII_SPEED_1000] = {
+   .bmcr  = BMCR_FULLDPLX | BMCR_SPEED1000,
+   .bmsr  = BMSR_ESTATEN,
+   .lpagb = LPA_1000FULL | LPA_1000HALF,
+   },
+};
+
+static const struct swmii_regs duplex[] = {
+   [SWMII_DUPLEX_HALF] = {
+   .bmcr  = ~BMCR_FULLDPLX,
+   .bmsr  = BMSR_ESTATEN | BMSR_100HALF,
+   .lpa   = LPA_10HALF | LPA_100HALF,
+   .lpagb = LPA_1000HALF,
+   },
+   [SWMII_DUPLEX_FULL] = {
+   .bmcr  = ~0,
+   .bmsr  = BMSR_ESTATEN | BMSR_100FULL,
+   .lpa   = LPA_10FULL | LPA_100FULL,
+   .lpagb = LPA_1000FULL,
+   },
+};
+
+static int swphy_decode_speed(int speed)
+{
+   switch (speed) {
+   case 1000:
+   return SWMII_SPEED_1000;
+   case 100:
+   return SWMII_SPEED_100;
+   case 10:
+   return SWMII_SPEED_10;
+   default:
+   return -EINVAL;
+   }
+}
+
 /**
  * swphy_update_regs - update MII register array with fixed phy state
  * @regs: array of 32 registers to update
@@ -30,81 +96,28 @@
  */
 int swphy_update_regs(u16 *regs, const struct fixed_phy_status *state)
 {
+   int speed_index, duplex_index;
u16 bmsr = BMSR_ANEGCAPABLE;
u16 bmcr = 0;
u16 lpagb = 0;
u16 lpa = 0;
 
-   if (state->duplex) {
-   switch (state->speed) {
-   case 1000:
-   bmsr |= BMSR_ESTATEN;
-   break;
-   case 100:
-   bmsr |= BMSR_100FULL;
-   break;
-   case 10:
-   bmsr |= BMSR_10FULL;
-   break;
-   default:
-   break;
-   }
-   } else {
-   switch (state->speed) {
-   case 1000:
-   bmsr |= BMSR_ESTATEN;
-   break;
-   case 100:
-   bmsr |= BMSR_100HALF;
-   break;
-   case 10:
-   bmsr |= BMSR_10HALF;
-   break;
-   default:
-   break;
-   }
+   speed_index = swphy_decode_speed(state->speed);
+   if (speed_index < 0) {
+   pr_warn("swphy: unknown speed\n");
+   return -EINVAL;
}
 
+   duplex_index = state->duplex ? SWMII_DUPLEX_FULL : SWMII_DUPLEX_HALF;
+
+   bmsr |= speed[speed_index].bmsr & duplex[duplex_index].bmsr;
+
if (state->link) {
bmsr |= BMSR_LSTATUS | BMSR_ANEGCOMPLETE;
 
-   if (state->duplex) {
-   bmcr |= BMCR_FULLDPLX;
-
-   switch (state->speed) {
-   case 1000:
-   bmcr |= BMCR_SPEED1000;
-   lpagb |= LPA_1000FULL;
-

[PATCH net-next 4/5] phy: generate swphy registers on the fly

2016-06-23 Thread Russell King
Generate software phy registers as and when requested, rather than
duplicating the state in fixed_phy.  This allows us to eliminate
the duplicate storage of of the same data, which is only different
in format.

As fixed_phy_update_regs() no longer updates register state, rename
it to fixed_phy_update().

Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/phy/fixed_phy.c | 31 +-
 drivers/net/phy/swphy.c | 47 -
 drivers/net/phy/swphy.h |  2 +-
 3 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index d84e30c46824..0dfed86bdb5a 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -26,8 +26,6 @@
 
 #include "swphy.h"
 
-#define MII_REGS_NUM 29
-
 struct fixed_mdio_bus {
struct mii_bus *mii_bus;
struct list_head phys;
@@ -35,7 +33,6 @@ struct fixed_mdio_bus {
 
 struct fixed_phy {
int addr;
-   u16 regs[MII_REGS_NUM];
struct phy_device *phydev;
struct fixed_phy_status status;
int (*link_update)(struct net_device *, struct fixed_phy_status *);
@@ -48,12 +45,10 @@ static struct fixed_mdio_bus platform_fmb = {
.phys = LIST_HEAD_INIT(platform_fmb.phys),
 };
 
-static void fixed_phy_update_regs(struct fixed_phy *fp)
+static void fixed_phy_update(struct fixed_phy *fp)
 {
if (gpio_is_valid(fp->link_gpio))
fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio);
-
-   swphy_update_regs(fp->regs, >status);
 }
 
 static int fixed_mdio_read(struct mii_bus *bus, int phy_addr, int reg_num)
@@ -61,29 +56,15 @@ static int fixed_mdio_read(struct mii_bus *bus, int 
phy_addr, int reg_num)
struct fixed_mdio_bus *fmb = bus->priv;
struct fixed_phy *fp;
 
-   if (reg_num >= MII_REGS_NUM)
-   return -1;
-
-   /* We do not support emulating Clause 45 over Clause 22 register reads
-* return an error instead of bogus data.
-*/
-   switch (reg_num) {
-   case MII_MMD_CTRL:
-   case MII_MMD_DATA:
-   return -1;
-   default:
-   break;
-   }
-
list_for_each_entry(fp, >phys, node) {
if (fp->addr == phy_addr) {
/* Issue callback if user registered it. */
if (fp->link_update) {
fp->link_update(fp->phydev->attached_dev,
>status);
-   fixed_phy_update_regs(fp);
+   fixed_phy_update(fp);
}
-   return fp->regs[reg_num];
+   return swphy_read_reg(reg_num, >status);
}
}
 
@@ -143,7 +124,7 @@ int fixed_phy_update_state(struct phy_device *phydev,
_UPD(pause);
_UPD(asym_pause);
 #undef _UPD
-   fixed_phy_update_regs(fp);
+   fixed_phy_update(fp);
return 0;
}
}
@@ -168,8 +149,6 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
if (!fp)
return -ENOMEM;
 
-   memset(fp->regs, 0xFF,  sizeof(fp->regs[0]) * MII_REGS_NUM);
-
if (irq != PHY_POLL)
fmb->mii_bus->irq[phy_addr] = irq;
 
@@ -184,7 +163,7 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
goto err_regs;
}
 
-   fixed_phy_update_regs(fp);
+   fixed_phy_update(fp);
 
list_add_tail(>node, >phys);
 
diff --git a/drivers/net/phy/swphy.c b/drivers/net/phy/swphy.c
index 21a9bd8a7830..34f58f2349e9 100644
--- a/drivers/net/phy/swphy.c
+++ b/drivers/net/phy/swphy.c
@@ -20,6 +20,8 @@
 
 #include "swphy.h"
 
+#define MII_REGS_NUM 29
+
 struct swmii_regs {
u16 bmcr;
u16 bmsr;
@@ -110,14 +112,13 @@ int swphy_validate_state(const struct fixed_phy_status 
*state)
 EXPORT_SYMBOL_GPL(swphy_validate_state);
 
 /**
- * swphy_update_regs - update MII register array with fixed phy state
- * @regs: array of 32 registers to update
+ * swphy_read_reg - return a MII register from the fixed phy state
+ * @reg: MII register
  * @state: fixed phy status
  *
- * Update the array of MII registers with the fixed phy link, speed,
- * duplex and pause mode settings.
+ * Return the MII @reg register generated from the fixed phy state @state.
  */
-void swphy_update_regs(u16 *regs, const struct fixed_phy_status *state)
+int swphy_read_reg(int reg, const struct fixed_phy_status *state)
 {
int speed_index, duplex_index;
u16 bmsr = BMSR_ANEGCAPABLE;
@@ -125,9 +126,12 @@ void swphy_update_regs(u16 *regs, const struct 
fixed_phy_status *state)
u16 lpagb = 0;
u16 lpa = 0;
 
+   if (reg > MII_REGS_NUM)
+   return -1;

[PATCH net-next 5/5] phy: improve safety of fixed-phy MII register reading

2016-06-23 Thread Russell King
There is no prevention of a concurrent call to both fixed_mdio_read()
and fixed_phy_update_state(), which can result in the state being
modified while it's being inspected.  Fix this by using a seqcount
to detect modifications, and memcpy()ing the state.

We remain slightly naughty here, calling link_update() and updating
the link status within the read-side loop - which would need rework
of the design to change.

Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/phy/fixed_phy.c | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 0dfed86bdb5a..b376ada83598 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "swphy.h"
 
@@ -34,6 +35,7 @@ struct fixed_mdio_bus {
 struct fixed_phy {
int addr;
struct phy_device *phydev;
+   seqcount_t seqcount;
struct fixed_phy_status status;
int (*link_update)(struct net_device *, struct fixed_phy_status *);
struct list_head node;
@@ -58,13 +60,21 @@ static int fixed_mdio_read(struct mii_bus *bus, int 
phy_addr, int reg_num)
 
list_for_each_entry(fp, >phys, node) {
if (fp->addr == phy_addr) {
-   /* Issue callback if user registered it. */
-   if (fp->link_update) {
-   fp->link_update(fp->phydev->attached_dev,
-   >status);
-   fixed_phy_update(fp);
-   }
-   return swphy_read_reg(reg_num, >status);
+   struct fixed_phy_status state;
+   int s;
+
+   do {
+   s = read_seqcount_begin(>seqcount);
+   /* Issue callback if user registered it. */
+   if (fp->link_update) {
+   
fp->link_update(fp->phydev->attached_dev,
+   >status);
+   fixed_phy_update(fp);
+   }
+   state = fp->status;
+   } while (read_seqcount_retry(>seqcount, s));
+
+   return swphy_read_reg(reg_num, );
}
}
 
@@ -116,6 +126,7 @@ int fixed_phy_update_state(struct phy_device *phydev,
 
list_for_each_entry(fp, >phys, node) {
if (fp->addr == phydev->mdio.addr) {
+   write_seqcount_begin(>seqcount);
 #define _UPD(x) if (changed->x) \
fp->status.x = status->x
_UPD(link);
@@ -125,6 +136,7 @@ int fixed_phy_update_state(struct phy_device *phydev,
_UPD(asym_pause);
 #undef _UPD
fixed_phy_update(fp);
+   write_seqcount_end(>seqcount);
return 0;
}
}
@@ -149,6 +161,8 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
if (!fp)
return -ENOMEM;
 
+   seqcount_init(>seqcount);
+
if (irq != PHY_POLL)
fmb->mii_bus->irq[phy_addr] = irq;
 
-- 
2.1.0



Re: [alsa-devel] [very-RFC 0/8] TSN driver for the kernel

2016-06-23 Thread Richard Cochran
On Thu, Jun 23, 2016 at 12:38:48PM +0200, Henrik Austad wrote:
> Richard: is it fair to assume that if ptp4l is running and is part of a PTP 
> domain, ktime_get() will return PTP-adjusted time for the system?

No.

> Or do I also need to run phc2sys in order to sync the system-time
> to PTP-time?

Yes, unless you are using SW time stamping, in which case ptp4l will
steer the system clock directly.

HTH,
Richard





[PATCH net v2] ipv6: enforce egress device match in per table nexthop lookups

2016-06-23 Thread Paolo Abeni
With commit 8c14586fc320 ("net: ipv6: Use passed in table for
nexthop lookups"), next-hop lookup is first performed on route creation
in the passed-in table.
However, device match is not enforced in the table lookup, so the found
route can be later discarded due to an egress device mismatch, and no
global lookup will be performed.
This causes the following to fail:

ip link add dummy1 type dummy
ip link add dummy2 type dummy
ip link set dummy1 up
ip link set dummy2 up
ip route add 2001:db8:8086::/48 dev dummy1 metric 20
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy1 metric 20
ip route add 2001:db8:8086::/48 dev dummy2 metric 21
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy2 metric 21
RTNETLINK answers: No route to host

This change fixes the issue by enforcing device lookup in
ip6_nh_lookup_table().

v1->v2: updated commit message title

Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
Reported-and-tested-by: Beniamino Galvani 
Signed-off-by: Paolo Abeni 
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913d..520b788 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net 
*net,
};
struct fib6_table *table;
struct rt6_info *rt;
-   int flags = 0;
+   int flags = RT6_LOOKUP_F_IFACE;
 
table = fib6_get_table(net, cfg->fc_table);
if (!table)
-- 
1.8.3.1



Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-23 Thread ivan.khoronzhuk



On 23.06.16 15:36, Grygorii Strashko wrote:

TI CPDMA currently uses a bitmap for tracking descriptor
allocations, but the genalloc framework already handles the same and can
be used both with special memory (SRAM) and with DMA coherent memory
chunks (dma_alloc_coherent()). Hence, switch to using genalloc and add a
desc_num property for each channel to limit the maximum number of
allowed descriptors per CPDMA channel. This patch does not affect
net throughput.

Cc: Ivan Khoronzhuk 
Signed-off-by: Grygorii Strashko 


Tested-by: Ivan Khoronzhuk 


---
Testing
TCP window: 256K, bandwidth in Mbits/sec:
  host: iperf -s
  device: iperf -c  172.22.39.17 -t600 -i5 -d -w128K

AM437x-idk, 1Gbps link
  before: : 341.60, after: 232+123=355
am57xx-beagle-x15, 1Gbps link
  before: : 1112.80, after: 814+321=1135
am335x-boneblack, 100Mbps link
  before: : 162.40, after: 72+93=165

  drivers/net/ethernet/ti/davinci_cpdma.c | 136 +++-
  1 file changed, 62 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index 18bf3a8..03b9882 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -21,7 +21,7 @@
  #include 
  #include 
  #include 
-
+#include 
  #include "davinci_cpdma.h"

  /* DMA Registers */
@@ -87,9 +87,8 @@ struct cpdma_desc_pool {
void*cpumap;/* dma_alloc map */
int desc_size, mem_size;
int num_desc, used_desc;
-   unsigned long   *bitmap;
struct device   *dev;
-   spinlock_t  lock;
+   struct gen_pool *gen_pool;
  };

  enum cpdma_state {
@@ -117,6 +116,7 @@ struct cpdma_chan {
int chan_num;
spinlock_t  lock;
int count;
+   u32 desc_num;
u32 mask;
cpdma_handler_fnhandler;
enum dma_data_direction dir;
@@ -145,6 +145,20 @@ struct cpdma_chan {
 (directed << CPDMA_TO_PORT_SHIFT));  \
} while (0)

+static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
+{
+   if (!pool)
+   return;
+
+   WARN_ON(pool->used_desc);
+   if (pool->cpumap) {
+   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
+ pool->phys);
+   } else {
+   iounmap(pool->iomap);
+   }
+}
+
  /*
   * Utility constructs for a cpdma descriptor pool.  Some devices (e.g. davinci
   * emac) have dedicated on-chip memory for these descriptors.  Some other
@@ -155,24 +169,25 @@ static struct cpdma_desc_pool *
  cpdma_desc_pool_create(struct device *dev, u32 phys, dma_addr_t hw_addr,
int size, int align)
  {
-   int bitmap_size;
struct cpdma_desc_pool *pool;
+   int ret;

pool = devm_kzalloc(dev, sizeof(*pool), GFP_KERNEL);
if (!pool)
-   goto fail;
-
-   spin_lock_init(>lock);
+   goto gen_pool_create_fail;

pool->dev= dev;
pool->mem_size   = size;
pool->desc_size  = ALIGN(sizeof(struct cpdma_desc), align);
pool->num_desc   = size / pool->desc_size;

-   bitmap_size  = (pool->num_desc / BITS_PER_LONG) * sizeof(long);
-   pool->bitmap = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
-   if (!pool->bitmap)
-   goto fail;
+   pool->gen_pool = devm_gen_pool_create(dev, ilog2(pool->desc_size), -1,
+ "cpdma");
+   if (IS_ERR(pool->gen_pool)) {
+   dev_err(dev, "pool create failed %ld\n",
+   PTR_ERR(pool->gen_pool));
+   goto gen_pool_create_fail;
+   }

if (phys) {
pool->phys  = phys;
@@ -185,24 +200,22 @@ cpdma_desc_pool_create(struct device *dev, u32 phys, 
dma_addr_t hw_addr,
pool->phys = pool->hw_addr; /* assumes no IOMMU, don't use this 
value */
}

-   if (pool->iomap)
-   return pool;
-fail:
-   return NULL;
-}
-
-static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
-{
-   if (!pool)
-   return;
+   if (!pool->iomap)
+   goto gen_pool_create_fail;

-   WARN_ON(pool->used_desc);
-   if (pool->cpumap) {
-   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
- pool->phys);
-   } else {
-   iounmap(pool->iomap);
+   ret = gen_pool_add_virt(pool->gen_pool, (unsigned long)pool->iomap,
+   pool->phys, pool->mem_size, -1);
+   if (ret < 0) {
+   

Re: [PATCH net] ipv6: allows graceful fallback from table lookup

2016-06-23 Thread Paolo Abeni
On Thu, 2016-06-23 at 15:11 +0200, Paolo Abeni wrote:
> with the commit 8c14586fc320 ("net: ipv6: Use passed in table for
> nexthop lookups"), net hop lookup is first performed on route creation
> in the passed-in table.
> However device match is not enforced in table lookup, so the found
> route can be later discarded due to egress device mismatch and no
> global lookup will be performed.
> This cause the following to fail:
> 
> ip link add dummy1 type dummy
> ip link add dummy2 type dummy
> ip link set dummy1 up
> ip link set dummy2 up
> ip route add 2001:db8:8086::/48 dev dummy1 metric 20
> ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy1 metric 20
> ip route add 2001:db8:8086::/48 dev dummy2 metric 21
> ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy2 metric 21
> RTNETLINK answers: No route to host
> 
> This change fixes the issue enforcing device lookup in
> ip6_nh_lookup_table()
> 
> Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
> Reported-and-tested-by: Beniamino Galvani 
> Signed-off-by: Paolo Abeni 

Oops, bad commit message title (not updated from a previous
implementation), I'll resubmit with a more relevant one. Sorry for the
noise.

Paolo



Re: [PATCH 2/3] can: fix oops caused by wrong rtnl dellink usage

2016-06-23 Thread Sergei Shtylyov

On 6/23/2016 4:01 PM, Oliver Hartkopp wrote:


From: Oliver Hartkopp 

For 'real' hardware CAN devices the netlink interface is used to set CAN
specific communication parameters. Real CAN hardware can not be
created nor
removed with the ip tool ...

This patch adds a private dellink function for the CAN device driver
interface
that does just nothing.

It's a follow up to commit 993e6f2fd ("can: fix oops caused by wrong rtnl
newlink usage") but for dellink.

Reported-by: ajneu 
Signed-off-by: Oliver Hartkopp 
Cc: 
Signed-off-by: Marc Kleine-Budde 
---
 drivers/net/can/dev.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 348dd5001fa4..ad535a854e5c 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -1011,6 +1011,11 @@ static int can_newlink(struct net *src_net,
struct net_device *dev,
 return -EOPNOTSUPP;
 }

+static void can_dellink(struct net_device *dev, struct list_head *head)
+{
+return;


   Why?



http://marc.info/?l=linux-can=146651600421205=2

The same reason as for commit 993e6f2fd.


   I was asking just about the useless *return* statement...


Regards,
Oliver


MBR, Sergei



[PATCH net] ipv6: allows graceful fallback from table lookup

2016-06-23 Thread Paolo Abeni
With commit 8c14586fc320 ("net: ipv6: Use passed in table for
nexthop lookups"), next-hop lookup is first performed on route creation
in the passed-in table.
However, device match is not enforced in the table lookup, so the found
route can be later discarded due to an egress device mismatch, and no
global lookup will be performed.
This causes the following to fail:

ip link add dummy1 type dummy
ip link add dummy2 type dummy
ip link set dummy1 up
ip link set dummy2 up
ip route add 2001:db8:8086::/48 dev dummy1 metric 20
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy1 metric 20
ip route add 2001:db8:8086::/48 dev dummy2 metric 21
ip route add 2001:db8:d34d::/64 via 2001:db8:8086::2 dev dummy2 metric 21
RTNETLINK answers: No route to host

This change fixes the issue by enforcing device lookup in
ip6_nh_lookup_table().

Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
Reported-and-tested-by: Beniamino Galvani 
Signed-off-by: Paolo Abeni 
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913d..520b788 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net 
*net,
};
struct fib6_table *table;
struct rt6_info *rt;
-   int flags = 0;
+   int flags = RT6_LOOKUP_F_IFACE;
 
table = fib6_get_table(net, cfg->fc_table);
if (!table)
-- 
1.8.3.1



Re: [PATCH 2/3] can: fix oops caused by wrong rtnl dellink usage

2016-06-23 Thread Oliver Hartkopp



On 06/23/2016 02:55 PM, Sergei Shtylyov wrote:

Hello.

On 6/23/2016 12:22 PM, Marc Kleine-Budde wrote:


From: Oliver Hartkopp 

For 'real' hardware CAN devices the netlink interface is used to set CAN
specific communication parameters. Real CAN hardware can not be
created nor
removed with the ip tool ...

This patch adds a private dellink function for the CAN device driver
interface
that does just nothing.

It's a follow up to commit 993e6f2fd ("can: fix oops caused by wrong rtnl
newlink usage") but for dellink.

Reported-by: ajneu 
Signed-off-by: Oliver Hartkopp 
Cc: 
Signed-off-by: Marc Kleine-Budde 
---
 drivers/net/can/dev.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 348dd5001fa4..ad535a854e5c 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -1011,6 +1011,11 @@ static int can_newlink(struct net *src_net,
struct net_device *dev,
 return -EOPNOTSUPP;
 }

+static void can_dellink(struct net_device *dev, struct list_head *head)
+{
+return;


   Why?



http://marc.info/?l=linux-can=146651600421205=2

The same reason as for commit 993e6f2fd.

Regards,
Oliver


+}
+
 static struct rtnl_link_ops can_link_ops __read_mostly = {
 .kind= "can",
 .maxtype= IFLA_CAN_MAX,

[...]

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe linux-can" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-23 Thread Ivan Khoronzhuk



On 23.06.16 15:36, Grygorii Strashko wrote:

TI CPDMA currently uses a bitmap for tracking descriptor
allocations, but the genalloc framework already handles the same and can
be used both with special memory (SRAM) and with DMA coherent memory
chunks (dma_alloc_coherent()). Hence, switch to using genalloc and add a
desc_num property for each channel to limit the maximum number of
allowed descriptors per CPDMA channel. This patch does not affect
net throughput.

Cc: Ivan Khoronzhuk 
Signed-off-by: Grygorii Strashko 
---
Testing
TCP window: 256K, bandwidth in Mbits/sec:
  host: iperf -s
  device: iperf -c  172.22.39.17 -t600 -i5 -d -w128K

AM437x-idk, 1Gbps link
  before: : 341.60, after: 232+123=355
am57xx-beagle-x15, 1Gbps link
  before: : 1112.80, after: 814+321=1135
am335x-boneblack, 100Mbps link
  before: : 162.40, after: 72+93=165

  drivers/net/ethernet/ti/davinci_cpdma.c | 136 +++-
  1 file changed, 62 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index 18bf3a8..03b9882 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -21,7 +21,7 @@
  #include 
  #include 
  #include 
-
+#include 
  #include "davinci_cpdma.h"

  /* DMA Registers */
@@ -87,9 +87,8 @@ struct cpdma_desc_pool {
void*cpumap;/* dma_alloc map */
int desc_size, mem_size;
int num_desc, used_desc;
-   unsigned long   *bitmap;
struct device   *dev;
-   spinlock_t  lock;
+   struct gen_pool *gen_pool;
  };

  enum cpdma_state {
@@ -117,6 +116,7 @@ struct cpdma_chan {
int chan_num;
spinlock_t  lock;
int count;
+   u32 desc_num;
u32 mask;
cpdma_handler_fnhandler;
enum dma_data_direction dir;
@@ -145,6 +145,20 @@ struct cpdma_chan {
 (directed << CPDMA_TO_PORT_SHIFT));  \
} while (0)

+static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
+{
+   if (!pool)
+   return;
+
+   WARN_ON(pool->used_desc);
+   if (pool->cpumap) {
+   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
+ pool->phys);
+   } else {
+   iounmap(pool->iomap);
+   }
+}
+

single if, brackets?


  /*
   * Utility constructs for a cpdma descriptor pool.  Some devices (e.g. davinci
   * emac) have dedicated on-chip memory for these descriptors.  Some other
@@ -155,24 +169,25 @@ static struct cpdma_desc_pool *
  cpdma_desc_pool_create(struct device *dev, u32 phys, dma_addr_t hw_addr,
int size, int align)
  {
-   int bitmap_size;
struct cpdma_desc_pool *pool;
+   int ret;

pool = devm_kzalloc(dev, sizeof(*pool), GFP_KERNEL);
if (!pool)
-   goto fail;
-
-   spin_lock_init(>lock);
+   goto gen_pool_create_fail;

pool->dev= dev;
pool->mem_size   = size;
pool->desc_size  = ALIGN(sizeof(struct cpdma_desc), align);
pool->num_desc   = size / pool->desc_size;

-   bitmap_size  = (pool->num_desc / BITS_PER_LONG) * sizeof(long);
-   pool->bitmap = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
-   if (!pool->bitmap)
-   goto fail;
+   pool->gen_pool = devm_gen_pool_create(dev, ilog2(pool->desc_size), -1,
+ "cpdma");
+   if (IS_ERR(pool->gen_pool)) {
+   dev_err(dev, "pool create failed %ld\n",
+   PTR_ERR(pool->gen_pool));
+   goto gen_pool_create_fail;
+   }

if (phys) {
pool->phys  = phys;
@@ -185,24 +200,22 @@ cpdma_desc_pool_create(struct device *dev, u32 phys, 
dma_addr_t hw_addr,
pool->phys = pool->hw_addr; /* assumes no IOMMU, don't use this 
value */
}

-   if (pool->iomap)
-   return pool;
-fail:
-   return NULL;
-}
-
-static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
-{
-   if (!pool)
-   return;
+   if (!pool->iomap)
+   goto gen_pool_create_fail;

-   WARN_ON(pool->used_desc);
-   if (pool->cpumap) {
-   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
- pool->phys);
-   } else {
-   iounmap(pool->iomap);
+   ret = gen_pool_add_virt(pool->gen_pool, (unsigned long)pool->iomap,
+   pool->phys, pool->mem_size, -1);
+   if (ret < 0) {
+   dev_err(dev, "pool add failed %d\n", 

  1   2   >