Re: kill off pci_enable_msi_{exact,range}

2017-01-12 Thread Christoph Hellwig
On Thu, Jan 12, 2017 at 03:29:00PM -0600, Bjorn Helgaas wrote:
> Applied all three (with Tom's ack on the amd-xgbe patch) to pci/msi for
> v4.11, thanks!

Tom had just sent me an even better version of the xgbe patch.  Tom,
maybe you can resend that relative to the PCI tree [1], so that we don't
lose it for next merge window?

[1] https://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git


Re: [net PATCH v3 3/5] virtio_net: factor out xdp handler for readability

2017-01-12 Thread Jason Wang



On 2017年01月13日 10:51, John Fastabend wrote:

At this point the do_xdp_prog is mostly if/else branches handling
the different modes of virtio_net. So remove it and handle running
the program in the per mode handlers.

Signed-off-by: John Fastabend 
---
  drivers/net/virtio_net.c |   76 +-
  1 file changed, 28 insertions(+), 48 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 43cb2e0..ec54644 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -388,49 +388,6 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
virtqueue_kick(sq->vq);
  }
  


[...]

  
  		/* This happens when rx buffer size is underestimated */

@@ -598,8 +570,10 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type))
goto err_xdp;
  
-		act = do_xdp_prog(vi, rq, xdp_prog,

- page_address(xdp_page) + offset, len);
+   data = page_address(xdp_page) + offset;
+   xdp.data = data + desc_room;
+   xdp.data_end = xdp.data + (len - vi->hdr_len);


It looks desc_room is always vi->hdr_len.


+   act = bpf_prog_run_xdp(xdp_prog, );
switch (act) {
case XDP_PASS:
/* We can only create skb based on xdp_page. */
@@ -613,13 +587,19 @@ static struct sk_buff *receive_mergeable(struct 
net_device *dev,
}
break;
case XDP_TX:
+   qp = vi->curr_queue_pairs -
+   vi->xdp_queue_pairs +
+   smp_processor_id();
+   virtnet_xdp_xmit(vi, rq, >sq[qp], , data);
ewma_pkt_len_add(>mrg_avg_pkt_len, len);
if (unlikely(xdp_page != page))
goto err_xdp;
rcu_read_unlock();
goto xdp_xmit;
-   case XDP_DROP:
default:
+   bpf_warn_invalid_xdp_action(act);
+   case XDP_ABORTED:
+   case XDP_DROP:
if (unlikely(xdp_page != page))
__free_pages(xdp_page, 0);
ewma_pkt_len_add(>mrg_avg_pkt_len, len);





Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head

2017-01-12 Thread Jason Wang



On 2017年01月13日 10:52, John Fastabend wrote:

Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.

In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.

At the moment this code must do a full reset to ensure old buffers
without headroom on program add or with headroom on program removal
are not used incorrectly in the datapath. Ideally we would only
have to disable/enable the RX queues being updated but there is no
API to do this at the moment in virtio so use the big hammer. In
practice it is likely not that big of a problem as this will only
happen when XDP is enabled/disabled; changing programs does not
require the reset. There is some risk that the driver may either
have an allocation failure or for some reason fail to correctly
negotiate with the underlying backend in this case the driver will
be left uninitialized. I have not seen this ever happen on my test
systems and for what it's worth this same failure case can occur
from probe and other contexts in virtio framework.

Signed-off-by: John Fastabend 
---
  drivers/net/virtio_net.c |  155 --
  drivers/virtio/virtio.c  |9 ++-
  include/linux/virtio.h   |3 +
  3 files changed, 144 insertions(+), 23 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6041828..8b897e7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -28,6 +28,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  
  static int napi_weight = NAPI_POLL_WEIGHT;

@@ -159,6 +160,9 @@ struct virtnet_info {
/* Ethtool settings */
u8 duplex;
u32 speed;
+
+   /* Headroom allocated in RX Queue */
+   unsigned int headroom;


If this could not be changed in anyway, better use a macro instead of a 
filed here. And there's even no need to add an extra parameter to 
add_recvbuf_mergeable().



  };
  
  struct padded_vnet_hdr {

@@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
}
  
  	if (vi->mergeable_rx_bufs) {

+   xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);


Fail to understand why this is needed. We should have excluded vnet 
header from xdp->data even before bpf_prog_run_xdp().



/* Zero header and leave csum up to XDP layers */
hdr = xdp->data;
memset(hdr, 0, vi->hdr_len);
@@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
num_sg = 2;
sg_init_table(sq->sg, 2);
sg_set_buf(sq->sg, hdr, vi->hdr_len);
-   skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+   skb_to_sgvec(skb, sq->sg + 1,
+xdp->data - xdp->data_hard_start,
+xdp->data_end - xdp->data);
}
err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
   data, GFP_ATOMIC);
@@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
struct bpf_prog *xdp_prog;
  
  	len -= vi->hdr_len;

-   skb_trim(skb, len);
  
  	rcu_read_lock();

xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
  
-		xdp.data = skb->data;

+   xdp.data_hard_start = skb->data;
+   xdp.data = skb->data + vi->headroom;
xdp.data_end = xdp.data + len;
act = bpf_prog_run_xdp(xdp_prog, );
switch (act) {
case XDP_PASS:
+   /* Recalculate length in case bpf program changed it */
+   len = xdp.data_end - xdp.data;
+   __skb_pull(skb, xdp.data - xdp.data_hard_start);


How about do this just after bpf_pro_run_xdp() for XDP_TX too? This is 
more readable and there's no need to change xmit path.



break;
case XDP_TX:
virtnet_xdp_xmit(vi, rq, , skb);
@@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
}
rcu_read_unlock();
  
+	skb_trim(skb, len);

return skb;
  
  err_xdp:

@@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type))
goto err_xdp;
  
+		/* Allow consuming headroom but reserve enough space to push

+* the descriptor on if we get an XDP_TX return code.
+*/
data = page_address(xdp_page) + offset;
+   xdp.data_hard_start = data - vi->headroom + desc_room;


Two 

Re: [PATCH net-next] IPsec: do not ignore crypto err in ah input

2017-01-12 Thread Alexander Alemayhu
On Thu, Jan 12, 2017 at 03:33:22PM +0200, Gilad Ben-Yossef wrote:
> ah input processing uses the asynchrnous hash crypto API which
> supplies an error code as part of the operation completion but 
> the error code was being ignored.
>
s/asynchrnous/asynchronous

-- 
Mit freundlichen Grüßen

Alexander Alemayhu


Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread Joe Perches
On Thu, 2017-01-12 at 22:57 -0800, David VomLehn wrote:
> On 01/12/2017 09:59 PM, Joe Perches wrote:
> > On Thu, 2017-01-12 at 21:24 -0800, David VomLehn wrote:
> > > On 01/12/2017 09:06 PM, Joe Perches wrote:
> > > > On Thu, 2017-01-12 at 21:02 -0800, Alexander Loktionov wrote:
> > > > > From: David VomLehn 
> > > > > 
> > > > > Patches to create the make and configuration files.
> > > > 
> > > > This patch should _really_ be the last in the series
> > > > not the first.
> > > > 
> > > 
> > > Could you explain the basis for this? By convention, we put tables of
> > > content at the beginning of books and only indices at the back.
> > > Analogously, make and config files can be used to establish the
> > > context for what follows, making it easier to understand. Once
> > > committed, of course, the order no longer matters except as bisection is
> > > concerned.
> > 
> > As I wrote the first time:
> > 
> > On Tue, 2016-12-27 at 08:15 -0800, Joe Perches wrote:
> > > On Tue, 2016-12-27 at 05:17 -0800, David VomLehn wrote:
> > > > Patches to create the make and configuration files.
> > 
> > []
> > > Patch 1 will not build if CONFIG_AQTION is enabled.
> > > Patch 1/12 should be reordered to be patch 12/12 and
> > > all the other patches moved up appropriately.
> > 
> > You don't create the files until later patches.
> > 
> > If you applied just this first patch and tried to
> > add CONFIG_AQTION=y to the .config, make fails.
> > 
> > That's bad for git bisect.
> > Every patch in this series should build properly.
> > 
> > If you delay the adding of the Makefile and Kconfig
> > until all the files are added, then it'd bisect fine.
> 
> Please go back and re-read the latest patches; I think you will find 
> your concern about CONFIG_AQTION addressed in the v5 patchset. The 01/13 
> patch no longer has the changes to Makefile and Kconfig in 
> drivers/net/ethernet that will pull in the Makefile and Kconfig from 
> drivers/net/ethernet/aquantia. Those changes are now in the 13/13 patch, 
> which should make it bisectable. If I am missing something, please let 
> me know.

Well fine then.

I just looked subject titles and didn't notice that change.

cheers, Joe


Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread David VomLehn

On 01/12/2017 09:59 PM, Joe Perches wrote:

On Thu, 2017-01-12 at 21:24 -0800, David VomLehn wrote:

On 01/12/2017 09:06 PM, Joe Perches wrote:

On Thu, 2017-01-12 at 21:02 -0800, Alexander Loktionov wrote:

From: David VomLehn 

Patches to create the make and configuration files.

This patch should _really_ be the last in the series
not the first.


Could you explain the basis for this? By convention, we put tables of
content at the beginning of books and only indices at the back.
Analogously, make and config files can be used to establish the
context for what follows, making it easier to understand. Once
committed, of course, the order no longer matters except as bisection is
concerned.

As I wrote the first time:

On Tue, 2016-12-27 at 08:15 -0800, Joe Perches wrote:

On Tue, 2016-12-27 at 05:17 -0800, David VomLehn wrote:

Patches to create the make and configuration files.

[]

Patch 1 will not build if CONFIG_AQTION is enabled.
Patch 1/12 should be reordered to be patch 12/12 and
all the other patches moved up appropriately.

You don't create the files until later patches.

If you applied just this first patch and tried to
add CONFIG_AQTION=y to the .config, make fails.

That's bad for git bisect.
Every patch in this series should build properly.

If you delay the adding of the Makefile and Kconfig
until all the files are added, then it'd bisect fine.
Please go back and re-read the latest patches; I think you will find 
your concern about CONFIG_AQTION addressed in the v5 patchset. The 01/13 
patch no longer has the changes to Makefile and Kconfig in 
drivers/net/ethernet that will pull in the Makefile and Kconfig from 
drivers/net/ethernet/aquantia. Those changes are now in the 13/13 patch, 
which should make it bisectable. If I am missing something, please let 
me know.


--
David VL



Re: To netlink or not to netlink, that is the question

2017-01-12 Thread Johannes Berg
On Thu, 2017-01-12 at 20:02 +0100, Jason A. Donenfeld wrote:
> Hi Dan,
> 
> Thanks for your response. I'd thought about this, at least for
> adding/removing wgpeers/wgipmasks and for configuring wgdevices. This
> would fit into multiple smaller messages indeed.
> 
> But what about fetching the list of all existing peers and ipmasks
> atomically? It seems like with multiple calls, if I'm using some kind
> of pagination, things could change in the process. That's why using
> one big buffer was most appealing... Any ideas about this?

In addition to what others have said - netlink typically includes (and
has helpers to do so) a generation counter that's updated whenever this
list changes, and included in each message, so if userspace really
cares (often not) it can retry the dump until the system was idle
enough to get a consistent snapshot.

It also looks to me like your existing API isn't even compat-safe due
to u64 alignment (e.g. in wgpeer), proving once again that ioctl is a
bad idea.

johannes


[PATCH net-next v2 4/5] bnxt_en: Add support for ethtool -p.

2017-01-12 Thread Michael Chan
Add LED blinking code to support ethtool -p on the PF.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 40 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 17 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 44 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h | 23 
 4 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index df2358b..2b46f9b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5621,6 +5621,45 @@ static int bnxt_hwrm_shutdown_link(struct bnxt *bp)
return hwrm_send_message(bp, , sizeof(req), HWRM_CMD_TIMEOUT);
 }
 
+static int bnxt_hwrm_port_led_qcaps(struct bnxt *bp)
+{
+   struct hwrm_port_led_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_port_led_qcaps_input req = {0};
+   struct bnxt_pf_info *pf = >pf;
+   int rc;
+
+   if (BNXT_VF(bp) || bp->hwrm_spec_code < 0x10601)
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, , HWRM_PORT_LED_QCAPS, -1, -1);
+   req.port_id = cpu_to_le16(pf->port_id);
+   mutex_lock(>hwrm_cmd_lock);
+   rc = _hwrm_send_message(bp, , sizeof(req), HWRM_CMD_TIMEOUT);
+   if (rc) {
+   mutex_unlock(>hwrm_cmd_lock);
+   return rc;
+   }
+   if (resp->num_leds > 0 && resp->num_leds < BNXT_MAX_LED) {
+   int i;
+
+   bp->num_leds = resp->num_leds;
+   memcpy(bp->leds, >led0_id, sizeof(bp->leds[0]) *
+bp->num_leds);
+   for (i = 0; i < bp->num_leds; i++) {
+   struct bnxt_led_info *led = >leds[i];
+   __le16 caps = led->led_state_caps;
+
+   if (!led->led_group_id ||
+   !BNXT_LED_ALT_BLINK_CAP(caps)) {
+   bp->num_leds = 0;
+   break;
+   }
+   }
+   }
+   mutex_unlock(>hwrm_cmd_lock);
+   return 0;
+}
+
 static bool bnxt_eee_config_ok(struct bnxt *bp)
 {
struct ethtool_eee *eee = >eee;
@@ -7244,6 +7283,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
}
 
bnxt_hwrm_func_qcfg(bp);
+   bnxt_hwrm_port_led_qcaps(bp);
 
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index f6b9b1c..52a1cc0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -868,6 +868,20 @@ struct bnxt_queue_info {
u8  queue_profile;
 };
 
+#define BNXT_MAX_LED   4
+
+struct bnxt_led_info {
+   u8  led_id;
+   u8  led_type;
+   u8  led_group_id;
+   u8  unused;
+   __le16  led_state_caps;
+#define BNXT_LED_ALT_BLINK_CAP(x)  ((x) &  \
+   cpu_to_le16(PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED))
+
+   __le16  led_color_caps;
+};
+
 #define BNXT_GRCPF_REG_WINDOW_BASE_OUT 0x400
 #define BNXT_CAG_REG_LEGACY_INT_STATUS 0x4014
 #define BNXT_CAG_REG_BASE  0x30
@@ -1123,6 +1137,9 @@ struct bnxt {
struct ethtool_eee  eee;
u32 lpi_tmr_lo;
u32 lpi_tmr_hi;
+
+   u8  num_leds;
+   struct bnxt_led_infoleds[BNXT_MAX_LED];
 };
 
 #define BNXT_RX_STATS_OFFSET(counter)  \
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index dd21be4..24818e1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2080,6 +2080,47 @@ static int bnxt_nway_reset(struct net_device *dev)
return rc;
 }
 
+static int bnxt_set_phys_id(struct net_device *dev,
+   enum ethtool_phys_id_state state)
+{
+   struct hwrm_port_led_cfg_input req = {0};
+   struct bnxt *bp = netdev_priv(dev);
+   struct bnxt_pf_info *pf = >pf;
+   struct bnxt_led_cfg *led_cfg;
+   u8 led_state;
+   __le16 duration;
+   int i, rc;
+
+   if (!bp->num_leds || BNXT_VF(bp))
+   return -EOPNOTSUPP;
+
+   if (state == ETHTOOL_ID_ACTIVE) {
+   led_state = PORT_LED_CFG_REQ_LED0_STATE_BLINKALT;
+   duration = cpu_to_le16(500);
+   } else if (state == ETHTOOL_ID_INACTIVE) {
+   led_state = PORT_LED_CFG_REQ_LED1_STATE_DEFAULT;
+   duration = cpu_to_le16(0);
+   } else {
+   return -EINVAL;
+   }
+   bnxt_hwrm_cmd_hdr_init(bp, , HWRM_PORT_LED_CFG, -1, -1);
+   req.port_id = 

[PATCH net-next v2 3/5] bnxt_en: Update to firmware interface spec to 1.6.1.

2017-01-12 Thread Michael Chan
Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 261 +++---
 1 file changed, 237 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
index d0d49ed..5df32ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
@@ -1,7 +1,7 @@
 /* Broadcom NetXtreme-C/E network driver.
  *
  * Copyright (c) 2014-2016 Broadcom Corporation
- * Copyright (c) 2016 Broadcom Limited
+ * Copyright (c) 2016-2017 Broadcom Limited
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -11,12 +11,12 @@
 #ifndef BNXT_HSI_H
 #define BNXT_HSI_H
 
-/* HSI and HWRM Specification 1.6.0 */
+/* HSI and HWRM Specification 1.6.1 */
 #define HWRM_VERSION_MAJOR 1
 #define HWRM_VERSION_MINOR 6
-#define HWRM_VERSION_UPDATE0
+#define HWRM_VERSION_UPDATE1
 
-#define HWRM_VERSION_STR   "1.6.0"
+#define HWRM_VERSION_STR   "1.6.1"
 /*
  * Following is the signature for HWRM message field that indicates not
  * applicable (All F's). Need to cast it the size of the field if needed.
@@ -549,6 +549,8 @@ struct hwrm_ver_get_output {
__le32 dev_caps_cfg;
#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_FW_UPD_SUPPORTED  0x1UL
#define VER_GET_RESP_DEV_CAPS_CFG_FW_DCBX_AGENT_SUPPORTED  0x2UL
+   #define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED  0x4UL
+   #define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED   0x8UL
u8 roce_fw_maj;
u8 roce_fw_min;
u8 roce_fw_bld;
@@ -1919,6 +1921,219 @@ struct hwrm_port_phy_i2c_read_output {
u8 valid;
 };
 
+/* hwrm_port_led_cfg */
+/* Input (64 bytes) */
+struct hwrm_port_led_cfg_input {
+   __le16 req_type;
+   __le16 cmpl_ring;
+   __le16 seq_id;
+   __le16 target_id;
+   __le64 resp_addr;
+   __le32 enables;
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_ID0x1UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_STATE 0x2UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_COLOR 0x4UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_ON  0x8UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_OFF 0x10UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED0_GROUP_ID  0x20UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_ID0x40UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_STATE 0x80UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_COLOR 0x100UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_ON  0x200UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_OFF 0x400UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED1_GROUP_ID  0x800UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_ID0x1000UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_STATE 0x2000UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_COLOR 0x4000UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_ON  0x8000UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_OFF 0x1UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED2_GROUP_ID  0x2UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_ID0x4UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_STATE 0x8UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_COLOR 0x10UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_ON  0x20UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_OFF 0x40UL
+   #define PORT_LED_CFG_REQ_ENABLES_LED3_GROUP_ID  0x80UL
+   __le16 port_id;
+   u8 num_leds;
+   u8 rsvd;
+   u8 led0_id;
+   u8 led0_state;
+   #define PORT_LED_CFG_REQ_LED0_STATE_DEFAULT0x0UL
+   #define PORT_LED_CFG_REQ_LED0_STATE_OFF0x1UL
+   #define PORT_LED_CFG_REQ_LED0_STATE_ON 0x2UL
+   #define PORT_LED_CFG_REQ_LED0_STATE_BLINK  0x3UL
+   #define PORT_LED_CFG_REQ_LED0_STATE_BLINKALT   0x4UL
+   u8 led0_color;
+   #define PORT_LED_CFG_REQ_LED0_COLOR_DEFAULT0x0UL
+   #define PORT_LED_CFG_REQ_LED0_COLOR_AMBER  0x1UL
+   #define PORT_LED_CFG_REQ_LED0_COLOR_GREEN  0x2UL
+   #define PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER 0x3UL
+   u8 unused_0;
+   __le16 led0_blink_on;
+   __le16 led0_blink_off;
+   u8 led0_group_id;
+   u8 rsvd0;
+   u8 led1_id;
+   u8 led1_state;
+   #define PORT_LED_CFG_REQ_LED1_STATE_DEFAULT0x0UL
+   #define PORT_LED_CFG_REQ_LED1_STATE_OFF0x1UL
+   #define PORT_LED_CFG_REQ_LED1_STATE_ON 

[PATCH net-next v2 0/5] bnxt_en: Misc. updates for net-next.

2017-01-12 Thread Michael Chan
Miscellaneous updates including firmware spec update, ethtool -p blinking
LED support, RDMA SRIOV config callback, and minor fixes.

v2: Dropped the DCBX RoCE app TLV patch until the ETH_P_IBOE RDMA patch
is merged.

Michael Chan (5):
  bnxt_en: Fix compiler warnings when CONFIG_RFS_ACCEL is not defined.
  bnxt_en: Clear TPA flags when BNXT_FLAG_NO_AGG_RINGS is set.
  bnxt_en: Update to firmware interface spec to 1.6.1.
  bnxt_en: Add support for ethtool -p.
  bnxt_en: Add the ulp_sriov_cfg hooks for bnxt_re RDMA driver.

 drivers/net/ethernet/broadcom/bnxt/bnxt.c |  44 
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  17 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  44 +++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h |  23 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 261 --
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   |   5 +
 6 files changed, 369 insertions(+), 25 deletions(-)

-- 
1.8.3.1



[PATCH net-next v2 2/5] bnxt_en: Clear TPA flags when BNXT_FLAG_NO_AGG_RINGS is set.

2017-01-12 Thread Michael Chan
Commit bdbd1eb59c56 ("bnxt_en: Handle no aggregation ring gracefully.")
introduced the BNXT_FLAG_NO_AGG_RINGS flag.  For consistency,
bnxt_set_tpa_flags() should also clear TPA flags when there are no
aggregation rings.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c091850..df2358b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2467,6 +2467,8 @@ static int bnxt_calc_nr_ring_pages(u32 ring_size, int 
desc_per_pg)
 static void bnxt_set_tpa_flags(struct bnxt *bp)
 {
bp->flags &= ~BNXT_FLAG_TPA;
+   if (bp->flags & BNXT_FLAG_NO_AGG_RINGS)
+   return;
if (bp->dev->features & NETIF_F_LRO)
bp->flags |= BNXT_FLAG_LRO;
if (bp->dev->features & NETIF_F_GRO)
-- 
1.8.3.1



[PATCH net-next v2 1/5] bnxt_en: Fix compiler warnings when CONFIG_RFS_ACCEL is not defined.

2017-01-12 Thread Michael Chan
  CC [M]  drivers/net/ethernet/broadcom/bnxt/bnxt.o
drivers/net/ethernet/broadcom/bnxt/bnxt.c:4947:21: warning: 
‘bnxt_get_max_func_rss_ctxs’ defined but not used [-Wunused-function]
 static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp)
 ^
  CC [M]  drivers/net/ethernet/broadcom/bnxt/bnxt.o
drivers/net/ethernet/broadcom/bnxt/bnxt.c:4956:21: warning: 
‘bnxt_get_max_func_vnics’ defined but not used [-Wunused-function]
 static unsigned int bnxt_get_max_func_vnics(struct bnxt *bp)
 ^

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7bd2a85..c091850 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4944,6 +4944,7 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
return rc;
 }
 
+#ifdef CONFIG_RFS_ACCEL
 static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp)
 {
 #if defined(CONFIG_BNXT_SRIOV)
@@ -4961,6 +4962,7 @@ static unsigned int bnxt_get_max_func_vnics(struct bnxt 
*bp)
 #endif
return bp->pf.max_vnics;
 }
+#endif
 
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
 {
-- 
1.8.3.1



[PATCH net-next v2 5/5] bnxt_en: Add the ulp_sriov_cfg hooks for bnxt_re RDMA driver.

2017-01-12 Thread Michael Chan
Add the ulp_sriov_cfg callbacks when the number of VFs is changing.  This
allows the RDMA driver to provision RDMA resources for the VFs.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 64ef0e5..0b8cd74 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -15,6 +15,7 @@
 #include 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
+#include "bnxt_ulp.h"
 #include "bnxt_sriov.h"
 #include "bnxt_ethtool.h"
 
@@ -555,6 +556,8 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs)
if (rc)
goto err_out2;
 
+   bnxt_ulp_sriov_cfg(bp, *num_vfs);
+
rc = pci_enable_sriov(bp->pdev, *num_vfs);
if (rc)
goto err_out2;
@@ -596,6 +599,8 @@ void bnxt_sriov_disable(struct bnxt *bp)
rtnl_lock();
bnxt_restore_pf_fw_resources(bp);
rtnl_unlock();
+
+   bnxt_ulp_sriov_cfg(bp, 0);
 }
 
 int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs)
-- 
1.8.3.1



Re: [PATCH] [v2] net: qcom/emac: add ethtool support

2017-01-12 Thread Timur Tabi

Timur Tabi wrote:

+static void emac_get_pauseparam(struct net_device *netdev,
+   struct ethtool_pauseparam *pause)
+{
+   struct phy_device *phydev = netdev->phydev;
+
+   if (phydev) {
+   if (phydev->autoneg)
+   pause->autoneg = 1;
+   if (phydev->pause)
+   pause->rx_pause = 1;
+   if (phydev->pause != phydev->asym_pause)
+   pause->tx_pause = 1;
+   }
+}


I finally figured out why this code was bothering me.

This function works, as long as I do NOT implement set_pauseparam. 
That's because the driver always matches the pause frame support in the 
MAC with whatever the PHY is doing.  Since the MAC and PHY are always in 
sync, I can use the PHY settings for get_pauseparam.


However, technically this is not correct.  get_pauseparam is supposed to 
return the setting of the MAC, not the PHY.  If I also implement 
set_pauseparam, which can force the MAC to ignore the PHY and 
enable/disable pause frames arbitrarily, then the above function is wrong.


Do I finally understand this correctly?

--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, hosted by The Linux Foundation.


[PATCH net-next v2 02/13] tcp: new helper for RACK to detect loss

2017-01-12 Thread Yuchung Cheng
Create a new helper tcp_rack_detect_loss to prepare the upcoming
RACK reordering timer patch.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_input.c| 12 
 net/ipv4/tcp_recovery.c | 22 +-
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1da0aa724929..51183bba3835 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,8 +1863,7 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern int tcp_rack_mark_lost(struct sock *sk);
-
+extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp,
 const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec6d84363024..bb24b93e64bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2865,10 +2865,14 @@ static void tcp_fastretrans_alert(struct sock *sk, 
const int acked,
}
 
/* Use RACK to detect loss */
-   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-   tcp_rack_mark_lost(sk)) {
-   flag |= FLAG_LOST_RETRANS;
-   *ack_flag |= FLAG_LOST_RETRANS;
+   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+   u32 prior_retrans = tp->retrans_out;
+
+   tcp_rack_mark_lost(sk);
+   if (prior_retrans > tp->retrans_out) {
+   flag |= FLAG_LOST_RETRANS;
+   *ack_flag |= FLAG_LOST_RETRANS;
+   }
}
 
/* E. Process state. */
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index f38dba5aed7a..7ea0377229c0 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,17 +32,11 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct 
sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk)
 {
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
-   u32 reo_wnd, prior_retrans = tp->retrans_out;
-
-   if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
-   return 0;
-
-   /* Reset the advanced flag to avoid unnecessary queue scanning */
-   tp->rack.advanced = 0;
+   u32 reo_wnd;
 
/* To be more reordering resilient, allow min_rtt/4 settling delay
 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
@@ -82,7 +76,17 @@ int tcp_rack_mark_lost(struct sock *sk)
break;
}
}
-   return prior_retrans - tp->retrans_out;
+}
+
+void tcp_rack_mark_lost(struct sock *sk)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+   return;
+   /* Reset the advanced flag to avoid unnecessary queue scanning */
+   tp->rack.advanced = 0;
+   tcp_rack_detect_loss(sk);
 }
 
 /* Record the most recently (re)sent time among the (s)acked packets */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 03/13] tcp: record most recent RTT in RACK loss detection

2017-01-12 Thread Yuchung Cheng
Record the most recent RTT in RACK. It is often identical to the
"ca_rtt_us" values in tcp_clean_rtx_queue. But when the packet has
been retransmitted, RACK choses to believe the ACK is for the
(latest) retransmitted packet if the RTT is over minimum RTT.

This requires passing the arrival time of the most recent ACK to
RACK routines. The timestamp is now recorded in the "ack_time"
in tcp_sacktag_state during the ACK processing.

This patch does not change the RACK algorithm itself. It only adds
the RTT variable to prepare the next main patch.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h |  1 +
 include/net/tcp.h   |  7 ---
 net/ipv4/tcp_input.c| 36 ++--
 net/ipv4/tcp_recovery.c | 41 +++--
 4 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fc5848dad7a4..1255c592719c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -207,6 +207,7 @@ struct tcp_sock {
/* Information of the most recently (s)acked skb */
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+   u32 rtt_us;  /* Associated RTT */
u8 advanced; /* mstamp advanced since last lost marking */
u8 reord;/* reordering detected */
} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51183bba3835..1439107658c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,9 +1863,10 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern void tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp,
-const struct skb_mstamp *xmit_time, u8 sacked);
+extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+const struct skb_mstamp *xmit_time,
+const struct skb_mstamp *ack_time);
 
 /*
  * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb24b93e64bc..8ccd171999bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1135,6 +1135,7 @@ struct tcp_sacktag_state {
 */
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+   struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
 };
@@ -1217,7 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
 
if (!(sacked & TCPCB_SACKED_ACKED)) {
-   tcp_rack_advance(tp, xmit_time, sacked);
+   tcp_rack_advance(tp, sacked, xmit_time, >ack_time);
 
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -2813,7 +2814,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit,
+ const struct skb_mstamp *ack_time)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2868,7 +2870,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
u32 prior_retrans = tp->retrans_out;
 
-   tcp_rack_mark_lost(sk);
+   tcp_rack_mark_lost(sk, ack_time);
if (prior_retrans > tp->retrans_out) {
flag |= FLAG_LOST_RETRANS;
*ack_flag |= FLAG_LOST_RETRANS;
@@ -3105,11 +3107,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct 
sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
   u32 prior_snd_una, int *acked,
-  struct tcp_sacktag_state *sack,
-  struct skb_mstamp *now)
+  struct tcp_sacktag_state *sack)
 {
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
+   struct skb_mstamp *now = >ack_time;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3169,7 +3171,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int 
prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
   

[PATCH net-next v2 04/13] tcp: add reordering timer in RACK loss detection

2017-01-12 Thread Yuchung Cheng
This patch makes RACK install a reordering timer when it suspects
some packets might be lost, but wants to delay the decision
a little bit to accommodate reordering.

It does not create a new timer but instead repurposes the existing
RTO timer, because both are meant to retransmit packets.
Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when
the RACK timing check fails. The wait time is set to

  RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge

This translates to expecting a packet (Packet) should take
(RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent.

When there are multiple packets that need a timer, we use one timer
with the maximum timeout. Therefore the timer conservatively uses
the maximum window to expire N packets by one timeout, instead of
N timeouts to expire N packets sent at different times.

The fudge factor is 2 jiffies to ensure when the timer fires, all
the suspected packets would exceed the deadline and be marked lost
by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the
clock may tick between calling icsk_reset_xmit_timer(timeout) and
actually hang the timer. The next jiffy is to lower-bound the timeout
to 2 jiffies when reo_wnd is < 1ms.

When the reordering timer fires (tcp_rack_reo_timeout): If we aren't
in Recovery we'll enter fast recovery and force fast retransmit.
This is very similar to the early retransmit (RFC5827) except RACK
is not constrained to only enter recovery for small outstanding
flights.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/inet_connection_sock.h |  4 ++-
 include/net/tcp.h  |  4 +++
 net/ipv4/inet_diag.c   |  1 +
 net/ipv4/tcp_input.c   |  6 ++--
 net/ipv4/tcp_ipv4.c|  1 +
 net/ipv4/tcp_output.c  |  3 +-
 net/ipv4/tcp_recovery.c| 57 +-
 net/ipv4/tcp_timer.c   |  3 ++
 net/ipv6/tcp_ipv6.c|  1 +
 9 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 85ee3879499e..84b2edde09b1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -144,6 +144,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_PROBE0   3   /* Zero window probe timer */
 #define ICSK_TIME_EARLY_RETRANS 4  /* Early retransmit timer */
 #define ICSK_TIME_LOSS_PROBE   5   /* Tail loss probe timer */
+#define ICSK_TIME_REO_TIMEOUT  6   /* Reordering timer */
 
 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock 
*sk, const int what,
}
 
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-   what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
+   what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+   what == ICSK_TIME_REO_TIMEOUT) {
icsk->icsk_pending = what;
icsk->icsk_timeout = jiffies + when;
sk_reset_timer(sk, >icsk_retransmit_timer, 
icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1439107658c2..64fcdeb3358b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval 
between probes
 * for local resources.
 */
+#define TCP_REO_TIMEOUT_MIN(2000) /* Min RACK reordering timeout in usec */
 
 #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
 #define TCP_KEEPALIVE_PROBES   9   /* Max of 9 keepalive probes
*/
@@ -397,6 +398,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff 
*skb,
 int tcp_child_process(struct sock *parent, struct sock *child,
  struct sk_buff *skb);
 void tcp_enter_loss(struct sock *sk);
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
@@ -541,6 +543,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff 
*skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
+void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
 int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
 
@@ -1867,6 +1870,7 @@ extern void tcp_rack_mark_lost(struct sock *sk, const 
struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, 

[PATCH net-next v2 07/13] tcp: enable RACK loss detection to trigger recovery

2017-01-12 Thread Yuchung Cheng
This patch changes two things:

1. Start fast recovery with RACK in addition to other heuristics
   (e.g., DUPACK threshold, FACK). Prior to this change RACK
   is enabled to detect losses only after the recovery has
   started by other algorithms.

2. Disable TCP early retransmit. RACK subsumes the early retransmit
   with the new reordering timer feature. A latter patch in this
   series removes the early retransmit code.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/tcp.h   | 11 ---
 net/ipv4/tcp_input.c| 29 +
 net/ipv4/tcp_recovery.c | 16 ++--
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5fb1e75a32a9..423438dd6fe9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_recovery;
+#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
+
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern int sysctl_tcp_min_tso_segs;
@@ -1043,6 +1046,7 @@ static inline void tcp_enable_early_retrans(struct 
tcp_sock *tp)
 
tp->do_early_retrans = sysctl_tcp_early_retrans &&
sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+   !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
net->ipv4.sysctl_tcp_reordering == 3;
 }
 
@@ -1859,13 +1863,6 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
-
-/* Flags to enable various loss recovery features. See below */
-extern int sysctl_tcp_recovery;
-
-/* Use TCP RACK to detect (some) tail and retransmit losses */
-#define TCP_RACK_LOST_RETRANS  0x1
-
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 const struct skb_mstamp *xmit_time,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c98dc874825..4ad75b8c4fee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, 
int flag)
  * F.e. after RTO, when all the queue is considered as lost,
  * lost_out = packets_out and in_flight = retrans_out.
  *
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
  * lost packets.
  *
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal the DUPACK thoreshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK: it is the simplest heuristics. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, 
int flag)
  * takes place. We use FACK by default until reordering
  * is suspected on the path to this destination.
  *
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
  * is lost (classic Reno). While we are in Recovery and
  * a partial ACK arrives, we assume that one more packet
  * is lost (NewReno). This heuristics are the same in NewReno
  * and SACK.
  *
- *  Imagine, that's all! Forget about all this shamanism about CWND inflation
- *  deflation etc. CWND is real congestion window, never inflated, changes
- *  only according to classic VJ rules.
- *
  * Really tricky (and requiring careful tuning) part of algorithm
  * is hidden in functions tcp_time_to_recover() and 
tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND 

[PATCH net-next v2 12/13] tcp: remove thin_dupack feature

2017-01-12 Thread Yuchung Cheng
Thin stream DUPACK is to start fast recovery on only one DUPACK
provided the connection is a thin stream (i.e., low inflight).  But
this older feature is now subsumed with RACK. If a connection
receives only a single DUPACK, RACK would arm a reordering timer
and soon starts fast recovery instead of timeout if no further
ACKs are received.

The socket option (THIN_DUPACK) is kept as a nop for compatibility.
Note that this patch does not change another thin-stream feature
which enables linear RTO. Although it might be good to generalize
that in the future (i.e., linear RTO for the first say 3 retries).

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt | 12 
 include/linux/tcp.h|  2 +-
 net/ipv4/sysctl_net_ipv4.c |  7 ---
 net/ipv4/tcp.c |  6 ++
 net/ipv4/tcp_input.c   | 13 -
 5 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 7de2cf79e16f..aa1bb49f1dc6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
Documentation/networking/tcp-thin.txt
Default: 0
 
-tcp_thin_dupack - BOOLEAN
-   Enable dynamic triggering of retransmissions after one dupACK
-   for thin streams. If set, a check is performed upon reception
-   of a dupACK to determine if the stream is thin (less than 4
-   packets in flight). As long as the stream is found to be thin,
-   data is retransmitted on the first received dupACK. This
-   improves retransmission latency for non-aggressive thin
-   streams, often found to be time-dependent.
-   For more information on thin streams, see
-   Documentation/networking/tcp-thin.txt
-   Default: 0
-
 tcp_limit_output_bytes - INTEGER
Controls TCP Small Queue limit per tcp socket.
TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4733368f953a..6c22332afb75 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,7 +220,7 @@ struct tcp_sock {
unused:5;
u8  nonagle : 4,/* Disable Nagle algorithm? */
thin_lto: 1,/* Use linear timeouts for thin streams */
-   thin_dupack : 1,/* Fast retransmit on first dupack  */
+   unused1 : 1,
repair  : 1,
frto: 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8  repair_queue;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0f2d37e8e983..c8d283615c6f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -537,13 +537,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler   = proc_dointvec
},
{
-   .procname   = "tcp_thin_dupack",
-   .data   = _tcp_thin_dupack,
-   .maxlen = sizeof(int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec
-   },
-   {
.procname   = "tcp_early_retrans",
.data   = _tcp_early_retrans,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d9023e8ed53e..aba6ea76338e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
-   else {
-   tp->thin_dupack = val;
-   }
break;
 
case TCP_REPAIR:
@@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
+
case TCP_THIN_DUPACK:
-   val = tp->thin_dupack;
+   val = 0;
break;
 
case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87315ab1ab1a..39ebc20ca1b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int 
flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)

[PATCH net-next v2 06/13] tcp: check undo conditions before detecting losses

2017-01-12 Thread Yuchung Cheng
Currently RACK would mark loss before the undo operations in TCP
loss recovery. This could incorrectly identify real losses as
spurious. For example a sender first experiences a delay spike and
then eventually some packets were lost due to buffer overrun.
In this case, the sender should perform fast recovery b/c not all
the packets were lost.

But the sender may first trigger a (spurious) RTO and reset
cwnd to 1. The following ACKs may be used to mark real losses by
tcp_rack_mark_lost. Then in tcp_process_loss this ACK could trigger
F-RTO undo condition and unmark real losses and revert the cwnd
reduction. If there are no more ACKs coming back, eventually the
sender would timeout again instead of performing fast recovery.

The patch fixes this incorrect process by always performing
the undo checks before detecting losses.

Fixes: 4f41b1c58a32 ("tcp: use RACK to detect losses")
Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 33 -
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e42ca11c0326..9c98dc874825 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2801,6 +2801,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked)
return false;
 }
 
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+  const struct skb_mstamp *ack_time)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   /* Use RACK to detect loss */
+   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+   u32 prior_retrans = tp->retrans_out;
+
+   tcp_rack_mark_lost(sk, ack_time);
+   if (prior_retrans > tp->retrans_out)
+   *ack_flag |= FLAG_LOST_RETRANS;
+   }
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2866,17 +2881,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
}
}
 
-   /* Use RACK to detect loss */
-   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
-   u32 prior_retrans = tp->retrans_out;
-
-   tcp_rack_mark_lost(sk, ack_time);
-   if (prior_retrans > tp->retrans_out) {
-   flag |= FLAG_LOST_RETRANS;
-   *ack_flag |= FLAG_LOST_RETRANS;
-   }
-   }
-
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2894,11 +2898,13 @@ static void tcp_fastretrans_alert(struct sock *sk, 
const int acked,
tcp_try_keep_open(sk);
return;
}
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
-   if (icsk->icsk_ca_state != TCP_CA_Open &&
-   !(flag & FLAG_LOST_RETRANS))
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
+   if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
default:
@@ -2912,6 +2918,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
 
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 13/13] tcp: disable fack by default

2017-01-12 Thread Yuchung Cheng
This patch disables FACK by default as RACK is the successor of FACK
(inspired by the insights behind FACK).

FACK[1] in Linux works as follows: a packet P is deemed lost,
if packet Q of higher sequence is s/acked and P and Q are distant
by at least dupthresh number of packets in sequence space.

FACK is more aggressive than the IETF recommended recovery for SACK
(RFC3517 A Conservative Selective Acknowledgment (SACK)-based Loss
 Recovery Algorithm for TCP), because a single SACK may trigger
fast recovery. This obviously won't work well with reordering so
FACK is dynamically disabled upon detecting reordering.

RACK supersedes FACK by using time distance instead of sequence
distance. On reordering, RACK waits for a quarter of RTT after
receiving a single SACK before starting recovery. (the timer can be made more
adaptive in the future by measuring reordering distance in time,
but currently RTT/4 seem to work well.) Once the recovery starts,
RACK behaves almost like FACK because it reduces the reordering
window to 1ms, so it fast retransmits quickly. In addition RACK
can detect loss retransmission as it does not care about the packet
sequences (being repeated or not), which is extremely useful when
the connection is going through a traffic policer.

Google server experiments indicate that disabling FACK after enabling
RACK has negligible impact on the overall loss recovery performance
with more reordering events detected.  But we still keep the FACK
implementation for backup if RACK has bugs that needs to be disabled.

[1] M. Mathis, J. Mahdavi, "Forward Acknowledgment: Refining
TCP Congestion Control," In Proceedings of SIGCOMM '96, August 1996.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 39ebc20ca1b2..1a34e9278c07 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
@@ -2114,7 +2114,8 @@ static inline int tcp_dupack_heuristics(const struct 
tcp_sock *tp)
  * dynamically measured and adjusted. This is implemented in
  * tcp_rack_mark_lost.
  *
- * FACK: it is the simplest heuristics. As soon as we decided
+ * FACK (Disabled by default. Subsumbed by RACK):
+ * It is the simplest heuristics. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 10/13] tcp: remove early retransmit

2017-01-12 Thread Yuchung Cheng
This patch removes the support of RFC5827 early retransmit (i.e.,
fast recovery on small inflight with <3 dupacks) because it is
subsumed by the new RACK loss detection. More specifically when
RACK receives DUPACKs, it'll arm a reordering timer to start fast
recovery after a quarter of (min)RTT, hence it covers the early
retransmit except RACK does not limit itself to specific inflight
or dupack numbers.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt | 19 +++
 include/linux/tcp.h|  3 +-
 include/net/tcp.h  | 19 ---
 net/ipv4/inet_diag.c   |  1 -
 net/ipv4/tcp.c |  3 --
 net/ipv4/tcp_input.c   | 60 ++
 net/ipv4/tcp_ipv4.c|  1 -
 net/ipv4/tcp_metrics.c |  1 -
 net/ipv4/tcp_minisocks.c   |  1 -
 net/ipv4/tcp_output.c  | 11 +++
 net/ipv4/tcp_timer.c   |  3 --
 net/ipv6/tcp_ipv6.c|  1 -
 12 files changed, 12 insertions(+), 111 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 7dd65c9cf707..7de2cf79e16f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
Allows TCP to send "duplicate" SACKs.
 
 tcp_early_retrans - INTEGER
-   Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
-   for triggering fast retransmit when the amount of outstanding data is
-   small and when no previously unsent data can be transmitted (such
-   that limited transmit could be used). Also controls the use of
-   Tail loss probe (TLP) that converts RTOs occurring due to tail
-   losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
+   Tail loss probe (TLP) converts RTOs occurring due to tail
+   losses into fast recovery (draft-ietf-tcpm-rack). Note that
+   TLP requires RACK to function properly (see tcp_recovery below)
Possible values:
-   0 disables ER
-   1 enables ER
-   2 enables ER but delays fast recovery and fast retransmit
- by a fourth of RTT. This mitigates connection falsely
- recovers when network has a small degree of reordering
- (less than 3 packets).
-   3 enables delayed ER and TLP.
-   4 enables TLP only.
+   0 disables TLP
+   3 or 4 enables TLP
Default: 3
 
 tcp_ecn - INTEGER
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8e5f4c15d0e5..4733368f953a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,8 +224,7 @@ struct tcp_sock {
repair  : 1,
frto: 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8  repair_queue;
-   u8  do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-   syn_data:1, /* SYN includes data */
+   u8  syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 423438dd6fe9..c55d65f74f7f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 const struct sk_buff *next_skb);
 
 /* tcp_input.c */
-void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
@@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
-/* TCP early-retransmit (ER) is similar to but more conservative than
- * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
- */
-static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
-{
-   struct net *net = sock_net((struct sock *)tp);
-
-   tp->do_early_retrans = sysctl_tcp_early_retrans &&
-   sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-   !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
-   net->ipv4.sysctl_tcp_reordering == 3;
-}
-
-static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
-{
-   tp->do_early_retrans = 0;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index d216e40623d3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ 

[PATCH net-next v2 11/13] tcp: remove RFC4653 NCR

2017-01-12 Thread Yuchung Cheng
This patch removes the (partial) implementation of the aggressive
limited transmit in RFC4653 TCP Non-Congestion Robustness (NCR).

NCR is a mitigation to the problem created by the dynamic
DUPACK threshold.  With the current adaptive DUPACK threshold
(tp->reordering) could cause timeouts by preventing fast recovery.
For example, if the last packet of a cwnd burst was reordered, the
threshold will be set to the size of cwnd. But if next application
burst is smaller than threshold and has drops instead of reorderings,
the sender would not trigger fast recovery but instead resorts to a
timeout recovery.

NCR mitigates this issue by checking the number of DUPACKs against
the current flight size additionally. The technique is similar to
the early retransmit RFC.

With RACK loss detection, this mitigation is not needed, because RACK
does not use DUPACK threshold to detect losses. RACK arms a reordering
timer to fire at most a quarter RTT later to start fast recovery.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 79c819077a59..87315ab1ab1a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2161,8 +2161,6 @@ static inline int tcp_dupack_heuristics(const struct 
tcp_sock *tp)
 static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
struct tcp_sock *tp = tcp_sk(sk);
-   __u32 packets_out;
-   int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2172,19 +2170,6 @@ static bool tcp_time_to_recover(struct sock *sk, int 
flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
 
-   /* Trick#4: It is still not OK... But will it be useful to delay
-* recovery more?
-*/
-   packets_out = tp->packets_out;
-   if (packets_out <= tp->reordering &&
-   tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
-   !tcp_may_send_now(sk)) {
-   /* We have nothing to send. This connection is limited
-* either by receiver window or by application.
-*/
-   return true;
-   }
-
/* If a thin stream is detected, retransmit after first
 * received dupack. Employ only if SACK is supported in order
 * to avoid possible corner-case series of spurious retransmissions
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 00/13] tcp: RACK fast recovery

2017-01-12 Thread Yuchung Cheng
The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
to trigger fast recovery with a reordering timer.

Previously RACK has been running in auxiliary mode where it is
used to detect packet losses once the recovery has triggered by
other algorithms (e.g., FACK). By inspecting packet timestamps,
RACK can start ACK-driven repairs timely. A few similar heuristics
are no longer needed and are either removed or disabled to reduce
the complexity of the Linux TCP loss recovery engine:

  1. FACK (Forward Acknowledgement)
  2. Early Retransmit (RFC5827)
  3. thin_dupack (fast recovery on single DUPACK for thin-streams)
  4. NCR (Non-Congestion Robustness RFC4653) (RFC4653)
  5. Forward Retransmit

After this change, Linux's loss recovery algorithms consist of
  1. Conventional DUPACK threshold approach (RFC6675)
  2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
  3. RTO plus F-RTO extension (RFC5682)

The patch set has been tested on Google servers extensively and
presented in several IETF meetings. The data suggests that RACK
successfully improves recovery performance:
https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf

Yuchung Cheng (13):
  tcp: new helper function for RACK loss detection
  tcp: new helper for RACK to detect loss
  tcp: record most recent RTT in RACK loss detection
  tcp: add reordering timer in RACK loss detection
  tcp: use sequence to break TS ties for RACK loss detection
  tcp: check undo conditions before detecting losses
  tcp: enable RACK loss detection to trigger recovery
  tcp: extend F-RTO to catch more spurious timeouts
  tcp: remove forward retransmit feature
  tcp: remove early retransmit
  tcp: remove RFC4653 NCR
  tcp: remove thin_dupack feature
  tcp: disable fack by default

 Documentation/networking/ip-sysctl.txt |  31 +
 include/linux/tcp.h|   8 +-
 include/net/inet_connection_sock.h |   4 +-
 include/net/tcp.h  |  40 ++
 net/ipv4/inet_diag.c   |   2 +-
 net/ipv4/sysctl_net_ipv4.c |   7 --
 net/ipv4/tcp.c |   9 +-
 net/ipv4/tcp_input.c   | 224 +
 net/ipv4/tcp_ipv4.c|   2 +-
 net/ipv4/tcp_metrics.c |   1 -
 net/ipv4/tcp_minisocks.c   |   1 -
 net/ipv4/tcp_output.c  |  75 ++-
 net/ipv4/tcp_recovery.c| 148 --
 net/ipv4/tcp_timer.c   |   4 +-
 net/ipv6/tcp_ipv6.c|   2 +-
 15 files changed, 237 insertions(+), 321 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 05/13] tcp: use sequence to break TS ties for RACK loss detection

2017-01-12 Thread Yuchung Cheng
The packets inside a jumbo skb (e.g., TSO) share the same skb
timestamp, even though they are sent sequentially on the wire. Since
RACK is based on time, it can not detect some packets inside the
same skb are lost.  However, we can leverage the packet sequence
numbers as extended timestamps to detect losses. Therefore, when
RACK timestamp is identical to skb's timestamp (i.e., one of the
packets of the skb is acked or sacked), we use the sequence numbers
of the acked and unacked packets to break ties.

We can use the same sequence logic to advance RACK xmit time as
well to detect more losses and avoid timeout.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h |  1 +
 include/net/tcp.h   |  2 +-
 net/ipv4/tcp_input.c|  5 +++--
 net/ipv4/tcp_recovery.c | 17 ++---
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1255c592719c..970d5f00589f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -208,6 +208,7 @@ struct tcp_sock {
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
u32 rtt_us;  /* Associated RTT */
+   u32 end_seq; /* Ending TCP sequence of the skb */
u8 advanced; /* mstamp advanced since last lost marking */
u8 reord;/* reordering detected */
} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 64fcdeb3358b..5fb1e75a32a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1867,7 +1867,7 @@ extern int sysctl_tcp_recovery;
 #define TCP_RACK_LOST_RETRANS  0x1
 
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 const struct skb_mstamp *xmit_time,
 const struct skb_mstamp *ack_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be1191829963..e42ca11c0326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1218,7 +1218,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
 
if (!(sacked & TCPCB_SACKED_ACKED)) {
-   tcp_rack_advance(tp, sacked, xmit_time, >ack_time);
+   tcp_rack_advance(tp, sacked, end_seq,
+xmit_time, >ack_time);
 
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -3171,7 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int 
prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
-   tcp_rack_advance(tp, sacked,
+   tcp_rack_advance(tp, sacked, scb->end_seq,
 >skb_mstamp,
 >ack_time);
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index eb39b1b6d1dc..1e330a2f913d 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -16,6 +16,14 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct 
sk_buff *skb)
}
 }
 
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+   const struct skb_mstamp *t2,
+   u32 seq1, u32 seq2)
+{
+   return skb_mstamp_after(t1, t2) ||
+  (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -60,7 +68,8 @@ static void tcp_rack_detect_loss(struct sock *sk, const 
struct skb_mstamp *now,
scb->sacked & TCPCB_SACKED_ACKED)
continue;
 
-   if (skb_mstamp_after(>rack.mstamp, >skb_mstamp)) {
+   if (tcp_rack_sent_after(>rack.mstamp, >skb_mstamp,
+   tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
 * A packet is lost if its elapsed time is beyond
 * the recent RTT plus the reordering window.
@@ -113,14 +122,15 @@ void tcp_rack_mark_lost(struct sock *sk, const struct 
skb_mstamp *now)
  * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
  * draft-cheng-tcpm-rack-00.txt
  */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
  const struct skb_mstamp 

[PATCH net-next v2 09/13] tcp: remove forward retransmit feature

2017-01-12 Thread Yuchung Cheng
Forward retransmit is an esoteric feature in RFC3517 (condition(3)
in the NextSeg()). Basically if a packet is not considered lost by
the current criteria (# of dupacks etc), but the congestion window
has room for more packets, then retransmit this packet.

However it actually conflicts with the rest of recovery design. For
example, when reordering is detected we want to be conservative
in retransmitting packets but forward-retransmit feature would
break that to force more retransmission. Also the implementation is
fairly complicated inside the retransmission logic inducing extra
iterations in the write queue. With RACK losses are being detected
timely and this heuristic is no longer necessary. There this patch
removes the feature.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp_input.c  |  5 -
 net/ipv4/tcp_output.c | 61 +++
 3 files changed, 3 insertions(+), 64 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 970d5f00589f..8e5f4c15d0e5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -307,7 +307,6 @@ struct tcp_sock {
 */
 
int lost_cnt_hint;
-   u32 retransmit_high;/* L-bits may be on up to this seqno */
 
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq;   /* snd_nxt at onset of congestion   */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9469ce384d3b..a041a92348ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock 
*tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq,
   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
-
-   if (!tp->lost_out ||
-   after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-   tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-   tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
}
tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ba9026cb70d..6327e4d368a4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff 
*skb, int segs)
return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-   const struct inet_connection_sock *icsk = inet_csk(sk);
-   const struct tcp_sock *tp = tcp_sk(sk);
-
-   /* Forward retransmissions are possible only during Recovery. */
-   if (icsk->icsk_ca_state != TCP_CA_Recovery)
-   return false;
-
-   /* No forward retransmissions in Reno are possible. */
-   if (tcp_is_reno(tp))
-   return false;
-
-   /* Yeah, we have to make difficult choice between forward transmission
-* and retransmission... Both ways have their merits...
-*
-* For now we do not retransmit anything, while we have some new
-* segments to send. In the other cases, follow rule 3 for
-* NextSeg() specified in RFC3517.
-*/
-
-   if (tcp_may_send_now(sk))
-   return false;
-
-   return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
-   u32 max_segs, last_lost;
+   u32 max_segs;
int mib_idx;
-   int fwd_rexmitting = 0;
 
if (!tp->packets_out)
return;
 
-   if (!tp->lost_out)
-   tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
-   last_lost = TCP_SKB_CB(skb)->end_seq;
-   if (after(last_lost, tp->retransmit_high))
-   last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
-   last_lost = tp->snd_una;
}
 
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 

[PATCH net-next v2 01/13] tcp: new helper function for RACK loss detection

2017-01-12 Thread Yuchung Cheng
Create a new helper tcp_rack_mark_skb_lost to prepare the
upcoming RACK reordering timer support.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_recovery.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..f38dba5aed7a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -3,6 +3,19 @@
 
 int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
 
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   tcp_skb_mark_lost_uncond_verify(tp, skb);
+   if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+   /* Account for retransmits that are lost again */
+   TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+   tp->retrans_out -= tcp_skb_pcount(skb);
+   NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+   }
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -61,13 +74,7 @@ int tcp_rack_mark_lost(struct sock *sk)
continue;
 
/* skb is lost if packet sent later is sacked */
-   tcp_skb_mark_lost_uncond_verify(tp, skb);
-   if (scb->sacked & TCPCB_SACKED_RETRANS) {
-   scb->sacked &= ~TCPCB_SACKED_RETRANS;
-   tp->retrans_out -= tcp_skb_pcount(skb);
-   NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
-   }
+   tcp_rack_mark_skb_lost(sk, skb);
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
 * b/c the rest are all sent after rack_sent
-- 
2.11.0.483.g087da7b7c-goog



[PATCH net-next v2 08/13] tcp: extend F-RTO to catch more spurious timeouts

2017-01-12 Thread Yuchung Cheng
Current F-RTO reverts cwnd reset whenever a never-retransmitted
packet was (s)acked. The timeout can be declared spurious because
the packets acknowledged by this ACK were transmitted before the
timeout, so clearly not all the packets are lost to reset the cwnd.

This nice detection does not really depend on F-RTO internals. This
patch applies the detection universally. On Google servers this
change detected 20% more spurious timeouts.

Suggested-by: Neal Cardwell 
Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 33 +++--
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4ad75b8c4fee..9469ce384d3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1939,7 +1939,6 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
-   bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg;  /* is receiver reneging on SACKs? */
bool mark_lost;
 
@@ -2000,13 +1999,15 @@ void tcp_enter_loss(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
 
-   /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
-* loss recovery is underway except recurring timeout(s) on
-* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+   /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+* if a previous recovery is underway, otherwise it may incorrectly
+* call a timeout spurious if some previously retransmitted packets
+* are s/acked (sec 3.2). We do not apply that retriction since
+* retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+* so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+* on PTMU discovery to avoid sending new data.
 */
-   tp->frto = sysctl_tcp_frto &&
-  (new_recovery || icsk->icsk_retransmits) &&
-  !inet_csk(sk)->icsk_mtup.probe_size;
+   tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2740,14 +2741,18 @@ static void tcp_process_loss(struct sock *sk, int flag, 
bool is_dupack,
tcp_try_undo_loss(sk, false))
return;
 
-   if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-   /* Step 3.b. A timeout is spurious if not all data are
-* lost, i.e., never-retransmitted data are (s)acked.
-*/
-   if ((flag & FLAG_ORIG_SACK_ACKED) &&
-   tcp_try_undo_loss(sk, true))
-   return;
+   /* The ACK (s)acks some never-retransmitted data meaning not all
+* the data packets before the timeout were lost. Therefore we
+* undo the congestion window and state. This is essentially
+* the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+* a retransmitted skb is permantly marked, we can apply such an
+* operation even if F-RTO was not used.
+*/
+   if ((flag & FLAG_ORIG_SACK_ACKED) &&
+   tcp_try_undo_loss(sk, tp->undo_marker))
+   return;
 
+   if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
-- 
2.11.0.483.g087da7b7c-goog



Re: [net-next 01/13] tcp: new helper function for RACK loss detection

2017-01-12 Thread Yuchung Cheng
On Thu, Jan 12, 2017 at 10:03 PM, Yuchung Cheng  wrote:
>
> Create a new helper tcp_rack_mark_skb_lost to prepare the
> upcoming RACK reordering timer support.
>
> Signed-off-by: Yuchung Cheng 
> Signed-off-by: Neal Cardwell 
> Acked-by: Eric Dumazet 
> ---
Oops I messed up the subject headers. Let me resubmit. Sorry.

>  net/ipv4/tcp_recovery.c | 21 ++---
>  1 file changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
> index e36df4fcfeba..f38dba5aed7a 100644
> --- a/net/ipv4/tcp_recovery.c
> +++ b/net/ipv4/tcp_recovery.c
> @@ -3,6 +3,19 @@
>
>  int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
>
> +static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
> +{
> +   struct tcp_sock *tp = tcp_sk(sk);
> +
> +   tcp_skb_mark_lost_uncond_verify(tp, skb);
> +   if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
> +   /* Account for retransmits that are lost again */
> +   TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
> +   tp->retrans_out -= tcp_skb_pcount(skb);
> +   NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
> +   }
> +}
> +
>  /* Marks a packet lost, if some packet sent later has been (s)acked.
>   * The underlying idea is similar to the traditional dupthresh and FACK
>   * but they look at different metrics:
> @@ -61,13 +74,7 @@ int tcp_rack_mark_lost(struct sock *sk)
> continue;
>
> /* skb is lost if packet sent later is sacked */
> -   tcp_skb_mark_lost_uncond_verify(tp, skb);
> -   if (scb->sacked & TCPCB_SACKED_RETRANS) {
> -   scb->sacked &= ~TCPCB_SACKED_RETRANS;
> -   tp->retrans_out -= tcp_skb_pcount(skb);
> -   NET_INC_STATS(sock_net(sk),
> - LINUX_MIB_TCPLOSTRETRANSMIT);
> -   }
> +   tcp_rack_mark_skb_lost(sk, skb);
> } else if (!(scb->sacked & TCPCB_RETRANS)) {
> /* Original data are sent sequentially so stop early
>  * b/c the rest are all sent after rack_sent
> --
> 2.11.0.483.g087da7b7c-goog
>


[net-next 07/13] tcp: enable RACK loss detection to trigger recovery

2017-01-12 Thread Yuchung Cheng
This patch changes two things:

1. Start fast recovery with RACK in addition to other heuristics
   (e.g., DUPACK threshold, FACK). Prior to this change RACK
   is enabled to detect losses only after the recovery has
   started by other algorithms.

2. Disable TCP early retransmit. RACK subsumes the early retransmit
   with the new reordering timer feature. A latter patch in this
   series removes the early retransmit code.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/tcp.h   | 11 ---
 net/ipv4/tcp_input.c| 29 +
 net/ipv4/tcp_recovery.c | 16 ++--
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5fb1e75a32a9..423438dd6fe9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_recovery;
+#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
+
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern int sysctl_tcp_min_tso_segs;
@@ -1043,6 +1046,7 @@ static inline void tcp_enable_early_retrans(struct 
tcp_sock *tp)
 
tp->do_early_retrans = sysctl_tcp_early_retrans &&
sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+   !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
net->ipv4.sysctl_tcp_reordering == 3;
 }
 
@@ -1859,13 +1863,6 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
-
-/* Flags to enable various loss recovery features. See below */
-extern int sysctl_tcp_recovery;
-
-/* Use TCP RACK to detect (some) tail and retransmit losses */
-#define TCP_RACK_LOST_RETRANS  0x1
-
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 const struct skb_mstamp *xmit_time,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c98dc874825..4ad75b8c4fee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, 
int flag)
  * F.e. after RTO, when all the queue is considered as lost,
  * lost_out = packets_out and in_flight = retrans_out.
  *
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
  * lost packets.
  *
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal the DUPACK thoreshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK: it is the simplest heuristics. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, 
int flag)
  * takes place. We use FACK by default until reordering
  * is suspected on the path to this destination.
  *
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
  * is lost (classic Reno). While we are in Recovery and
  * a partial ACK arrives, we assume that one more packet
  * is lost (NewReno). This heuristics are the same in NewReno
  * and SACK.
  *
- *  Imagine, that's all! Forget about all this shamanism about CWND inflation
- *  deflation etc. CWND is real congestion window, never inflated, changes
- *  only according to classic VJ rules.
- *
  * Really tricky (and requiring careful tuning) part of algorithm
  * is hidden in functions tcp_time_to_recover() and 
tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND 

[net-next 11/13] tcp: remove RFC4653 NCR

2017-01-12 Thread Yuchung Cheng
This patch removes the (partial) implementation of the aggressive
limited transmit in RFC4653 TCP Non-Congestion Robustness (NCR).

NCR is a mitigation to the problem created by the dynamic
DUPACK threshold.  With the current adaptive DUPACK threshold
(tp->reordering) could cause timeouts by preventing fast recovery.
For example, if the last packet of a cwnd burst was reordered, the
threshold will be set to the size of cwnd. But if next application
burst is smaller than threshold and has drops instead of reorderings,
the sender would not trigger fast recovery but instead resorts to a
timeout recovery.

NCR mitigates this issue by additionally checking the number of DUPACKs
against the current flight size. The technique is similar to
the early retransmit RFC.

With RACK loss detection, this mitigation is not needed, because RACK
does not use DUPACK threshold to detect losses. RACK arms a reordering
timer to fire at most a quarter RTT later to start fast recovery.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 79c819077a59..87315ab1ab1a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2161,8 +2161,6 @@ static inline int tcp_dupack_heuristics(const struct 
tcp_sock *tp)
 static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
struct tcp_sock *tp = tcp_sk(sk);
-   __u32 packets_out;
-   int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2172,19 +2170,6 @@ static bool tcp_time_to_recover(struct sock *sk, int 
flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
 
-   /* Trick#4: It is still not OK... But will it be useful to delay
-* recovery more?
-*/
-   packets_out = tp->packets_out;
-   if (packets_out <= tp->reordering &&
-   tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
-   !tcp_may_send_now(sk)) {
-   /* We have nothing to send. This connection is limited
-* either by receiver window or by application.
-*/
-   return true;
-   }
-
/* If a thin stream is detected, retransmit after first
 * received dupack. Employ only if SACK is supported in order
 * to avoid possible corner-case series of spurious retransmissions
-- 
2.11.0.483.g087da7b7c-goog



[net-next 08/13] tcp: extend F-RTO to catch more spurious timeouts

2017-01-12 Thread Yuchung Cheng
Current F-RTO reverts cwnd reset whenever a never-retransmitted
packet was (s)acked. The timeout can be declared spurious because
the packets acknowledged by this ACK were transmitted before the
timeout, so clearly not all the packets are lost to reset the cwnd.

This nice detection does not really depend on F-RTO internals. This
patch applies the detection universally. On Google servers this
change detected 20% more spurious timeouts.

Suggested-by: Neal Cardwell 
Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 33 +++--
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4ad75b8c4fee..9469ce384d3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1939,7 +1939,6 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
-   bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg;  /* is receiver reneging on SACKs? */
bool mark_lost;
 
@@ -2000,13 +1999,15 @@ void tcp_enter_loss(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
 
-   /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
-* loss recovery is underway except recurring timeout(s) on
-* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+   /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+* if a previous recovery is underway, otherwise it may incorrectly
+* call a timeout spurious if some previously retransmitted packets
+* are s/acked (sec 3.2). We do not apply that retriction since
+* retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+* so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+* on PTMU discovery to avoid sending new data.
 */
-   tp->frto = sysctl_tcp_frto &&
-  (new_recovery || icsk->icsk_retransmits) &&
-  !inet_csk(sk)->icsk_mtup.probe_size;
+   tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2740,14 +2741,18 @@ static void tcp_process_loss(struct sock *sk, int flag, 
bool is_dupack,
tcp_try_undo_loss(sk, false))
return;
 
-   if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
-   /* Step 3.b. A timeout is spurious if not all data are
-* lost, i.e., never-retransmitted data are (s)acked.
-*/
-   if ((flag & FLAG_ORIG_SACK_ACKED) &&
-   tcp_try_undo_loss(sk, true))
-   return;
+   /* The ACK (s)acks some never-retransmitted data meaning not all
+* the data packets before the timeout were lost. Therefore we
+* undo the congestion window and state. This is essentially
+* the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+* a retransmitted skb is permantly marked, we can apply such an
+* operation even if F-RTO was not used.
+*/
+   if ((flag & FLAG_ORIG_SACK_ACKED) &&
+   tcp_try_undo_loss(sk, tp->undo_marker))
+   return;
 
+   if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
-- 
2.11.0.483.g087da7b7c-goog



[net-next 05/13] tcp: use sequence to break TS ties for RACK loss detection

2017-01-12 Thread Yuchung Cheng
The packets inside a jumbo skb (e.g., TSO) share the same skb
timestamp, even though they are sent sequentially on the wire. Since
RACK is based on time, it can not detect some packets inside the
same skb are lost.  However, we can leverage the packet sequence
numbers as extended timestamps to detect losses. Therefore, when
RACK timestamp is identical to skb's timestamp (i.e., one of the
packets of the skb is acked or sacked), we use the sequence numbers
of the acked and unacked packets to break ties.

We can use the same sequence logic to advance RACK xmit time as
well to detect more losses and avoid timeout.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h |  1 +
 include/net/tcp.h   |  2 +-
 net/ipv4/tcp_input.c|  5 +++--
 net/ipv4/tcp_recovery.c | 17 ++---
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1255c592719c..970d5f00589f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -208,6 +208,7 @@ struct tcp_sock {
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
u32 rtt_us;  /* Associated RTT */
+   u32 end_seq; /* Ending TCP sequence of the skb */
u8 advanced; /* mstamp advanced since last lost marking */
u8 reord;/* reordering detected */
} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 64fcdeb3358b..5fb1e75a32a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1867,7 +1867,7 @@ extern int sysctl_tcp_recovery;
 #define TCP_RACK_LOST_RETRANS  0x1
 
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 const struct skb_mstamp *xmit_time,
 const struct skb_mstamp *ack_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be1191829963..e42ca11c0326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1218,7 +1218,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
 
if (!(sacked & TCPCB_SACKED_ACKED)) {
-   tcp_rack_advance(tp, sacked, xmit_time, >ack_time);
+   tcp_rack_advance(tp, sacked, end_seq,
+xmit_time, >ack_time);
 
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -3171,7 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int 
prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
-   tcp_rack_advance(tp, sacked,
+   tcp_rack_advance(tp, sacked, scb->end_seq,
 >skb_mstamp,
 >ack_time);
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index eb39b1b6d1dc..1e330a2f913d 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -16,6 +16,14 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct 
sk_buff *skb)
}
 }
 
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+   const struct skb_mstamp *t2,
+   u32 seq1, u32 seq2)
+{
+   return skb_mstamp_after(t1, t2) ||
+  (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -60,7 +68,8 @@ static void tcp_rack_detect_loss(struct sock *sk, const 
struct skb_mstamp *now,
scb->sacked & TCPCB_SACKED_ACKED)
continue;
 
-   if (skb_mstamp_after(>rack.mstamp, >skb_mstamp)) {
+   if (tcp_rack_sent_after(>rack.mstamp, >skb_mstamp,
+   tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
 * A packet is lost if its elapsed time is beyond
 * the recent RTT plus the reordering window.
@@ -113,14 +122,15 @@ void tcp_rack_mark_lost(struct sock *sk, const struct 
skb_mstamp *now)
  * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
  * draft-cheng-tcpm-rack-00.txt
  */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
  const struct skb_mstamp 

[net-next 12/13] tcp: remove thin_dupack feature

2017-01-12 Thread Yuchung Cheng
Thin stream DUPACK is to start fast recovery on only one DUPACK
provided the connection is a thin stream (i.e., low inflight).  But
this older feature is now subsumed by RACK. If a connection
receives only a single DUPACK, RACK would arm a reordering timer
and soon start fast recovery instead of timing out if no further
ACKs are received.

The socket option (THIN_DUPACK) is kept as a nop for compatibility.
Note that this patch does not change another thin-stream feature
which enables linear RTO. Although it might be good to generalize
that in the future (i.e., linear RTO for the first say 3 retries).

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt | 12 
 include/linux/tcp.h|  2 +-
 net/ipv4/sysctl_net_ipv4.c |  7 ---
 net/ipv4/tcp.c |  6 ++
 net/ipv4/tcp_input.c   | 13 -
 5 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 7de2cf79e16f..aa1bb49f1dc6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
Documentation/networking/tcp-thin.txt
Default: 0
 
-tcp_thin_dupack - BOOLEAN
-   Enable dynamic triggering of retransmissions after one dupACK
-   for thin streams. If set, a check is performed upon reception
-   of a dupACK to determine if the stream is thin (less than 4
-   packets in flight). As long as the stream is found to be thin,
-   data is retransmitted on the first received dupACK. This
-   improves retransmission latency for non-aggressive thin
-   streams, often found to be time-dependent.
-   For more information on thin streams, see
-   Documentation/networking/tcp-thin.txt
-   Default: 0
-
 tcp_limit_output_bytes - INTEGER
Controls TCP Small Queue limit per tcp socket.
TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4733368f953a..6c22332afb75 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,7 +220,7 @@ struct tcp_sock {
unused:5;
u8  nonagle : 4,/* Disable Nagle algorithm? */
thin_lto: 1,/* Use linear timeouts for thin streams */
-   thin_dupack : 1,/* Fast retransmit on first dupack  */
+   unused1 : 1,
repair  : 1,
frto: 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8  repair_queue;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0f2d37e8e983..c8d283615c6f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -537,13 +537,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler   = proc_dointvec
},
{
-   .procname   = "tcp_thin_dupack",
-   .data   = _tcp_thin_dupack,
-   .maxlen = sizeof(int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec
-   },
-   {
.procname   = "tcp_early_retrans",
.data   = _tcp_early_retrans,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d9023e8ed53e..aba6ea76338e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
-   else {
-   tp->thin_dupack = val;
-   }
break;
 
case TCP_REPAIR:
@@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
+
case TCP_THIN_DUPACK:
-   val = tp->thin_dupack;
+   val = 0;
break;
 
case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87315ab1ab1a..39ebc20ca1b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int 
flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)

[net-next 06/13] tcp: check undo conditions before detecting losses

2017-01-12 Thread Yuchung Cheng
Currently RACK would mark loss before the undo operations in TCP
loss recovery. This could incorrectly identify real losses as
spurious. For example a sender first experiences a delay spike and
then eventually some packets were lost due to buffer overrun.
In this case, the sender should perform fast recovery b/c not all
the packets were lost.

But the sender may first trigger a (spurious) RTO and reset
cwnd to 1. The following ACKs may be used to mark real losses by
tcp_rack_mark_lost. Then in tcp_process_loss this ACK could trigger
F-RTO undo condition and unmark real losses and revert the cwnd
reduction. If there are no more ACKs coming back, eventually the
sender would timeout again instead of performing fast recovery.

The patch fixes this incorrect process by always performing
the undo checks before detecting losses.

Fixes: 4f41b1c58a32 ("tcp: use RACK to detect losses")
Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 33 -
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e42ca11c0326..9c98dc874825 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2801,6 +2801,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked)
return false;
 }
 
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+  const struct skb_mstamp *ack_time)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   /* Use RACK to detect loss */
+   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+   u32 prior_retrans = tp->retrans_out;
+
+   tcp_rack_mark_lost(sk, ack_time);
+   if (prior_retrans > tp->retrans_out)
+   *ack_flag |= FLAG_LOST_RETRANS;
+   }
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2866,17 +2881,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
}
}
 
-   /* Use RACK to detect loss */
-   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
-   u32 prior_retrans = tp->retrans_out;
-
-   tcp_rack_mark_lost(sk, ack_time);
-   if (prior_retrans > tp->retrans_out) {
-   flag |= FLAG_LOST_RETRANS;
-   *ack_flag |= FLAG_LOST_RETRANS;
-   }
-   }
-
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2894,11 +2898,13 @@ static void tcp_fastretrans_alert(struct sock *sk, 
const int acked,
tcp_try_keep_open(sk);
return;
}
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
-   if (icsk->icsk_ca_state != TCP_CA_Open &&
-   !(flag & FLAG_LOST_RETRANS))
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
+   if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
default:
@@ -2912,6 +2918,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
 
+   tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
-- 
2.11.0.483.g087da7b7c-goog



[net-next 02/13] tcp: new helper for RACK to detect loss

2017-01-12 Thread Yuchung Cheng
Create a new helper tcp_rack_detect_loss to prepare the upcoming
RACK reordering timer patch.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_input.c| 12 
 net/ipv4/tcp_recovery.c | 22 +-
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1da0aa724929..51183bba3835 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,8 +1863,7 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern int tcp_rack_mark_lost(struct sock *sk);
-
+extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp,
 const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec6d84363024..bb24b93e64bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2865,10 +2865,14 @@ static void tcp_fastretrans_alert(struct sock *sk, 
const int acked,
}
 
/* Use RACK to detect loss */
-   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-   tcp_rack_mark_lost(sk)) {
-   flag |= FLAG_LOST_RETRANS;
-   *ack_flag |= FLAG_LOST_RETRANS;
+   if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+   u32 prior_retrans = tp->retrans_out;
+
+   tcp_rack_mark_lost(sk);
+   if (prior_retrans > tp->retrans_out) {
+   flag |= FLAG_LOST_RETRANS;
+   *ack_flag |= FLAG_LOST_RETRANS;
+   }
}
 
/* E. Process state. */
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index f38dba5aed7a..7ea0377229c0 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,17 +32,11 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct 
sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk)
 {
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
-   u32 reo_wnd, prior_retrans = tp->retrans_out;
-
-   if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
-   return 0;
-
-   /* Reset the advanced flag to avoid unnecessary queue scanning */
-   tp->rack.advanced = 0;
+   u32 reo_wnd;
 
/* To be more reordering resilient, allow min_rtt/4 settling delay
 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
@@ -82,7 +76,17 @@ int tcp_rack_mark_lost(struct sock *sk)
break;
}
}
-   return prior_retrans - tp->retrans_out;
+}
+
+void tcp_rack_mark_lost(struct sock *sk)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+   return;
+   /* Reset the advanced flag to avoid unnecessary queue scanning */
+   tp->rack.advanced = 0;
+   tcp_rack_detect_loss(sk);
 }
 
 /* Record the most recently (re)sent time among the (s)acked packets */
-- 
2.11.0.483.g087da7b7c-goog



[net-next 04/13] tcp: add reordering timer in RACK loss detection

2017-01-12 Thread Yuchung Cheng
This patch makes RACK install a reordering timer when it suspects
some packets might be lost, but wants to delay the decision
a little bit to accommodate reordering.

It does not create a new timer but instead repurposes the existing
RTO timer, because both are meant to retransmit packets.
Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when
the RACK timing check fails. The wait time is set to

  RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge

This translates to expecting that a packet (Packet) should take
(RACK.RTT + RACK.reo_wnd + fudge) to be delivered after it was sent.

When there are multiple packets that need a timer, we use one timer
with the maximum timeout. Therefore the timer conservatively uses
the maximum window to expire N packets by one timeout, instead of
N timeouts to expire N packets sent at different times.

The fudge factor is 2 jiffies to ensure when the timer fires, all
the suspected packets would exceed the deadline and be marked lost
by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the
clock may tick between calling icsk_reset_xmit_timer(timeout) and
actually hang the timer. The next jiffy is to lower-bound the timeout
to 2 jiffies when reo_wnd is < 1ms.

When the reordering timer fires (tcp_rack_reo_timeout): If we aren't
in Recovery we'll enter fast recovery and force fast retransmit.
This is very similar to the early retransmit (RFC5827) except RACK
is not constrained to only enter recovery for small outstanding
flights.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/net/inet_connection_sock.h |  4 ++-
 include/net/tcp.h  |  4 +++
 net/ipv4/inet_diag.c   |  1 +
 net/ipv4/tcp_input.c   |  6 ++--
 net/ipv4/tcp_ipv4.c|  1 +
 net/ipv4/tcp_output.c  |  3 +-
 net/ipv4/tcp_recovery.c| 57 +-
 net/ipv4/tcp_timer.c   |  3 ++
 net/ipv6/tcp_ipv6.c|  1 +
 9 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 85ee3879499e..84b2edde09b1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -144,6 +144,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_PROBE0   3   /* Zero window probe timer */
 #define ICSK_TIME_EARLY_RETRANS 4  /* Early retransmit timer */
 #define ICSK_TIME_LOSS_PROBE   5   /* Tail loss probe timer */
+#define ICSK_TIME_REO_TIMEOUT  6   /* Reordering timer */
 
 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock 
*sk, const int what,
}
 
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-   what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
+   what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+   what == ICSK_TIME_REO_TIMEOUT) {
icsk->icsk_pending = what;
icsk->icsk_timeout = jiffies + when;
sk_reset_timer(sk, >icsk_retransmit_timer, 
icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1439107658c2..64fcdeb3358b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval 
between probes
 * for local resources.
 */
+#define TCP_REO_TIMEOUT_MIN(2000) /* Min RACK reordering timeout in usec */
 
 #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
 #define TCP_KEEPALIVE_PROBES   9   /* Max of 9 keepalive probes
*/
@@ -397,6 +398,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff 
*skb,
 int tcp_child_process(struct sock *parent, struct sock *child,
  struct sk_buff *skb);
 void tcp_enter_loss(struct sock *sk);
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
@@ -541,6 +543,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff 
*skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
+void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
 int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
 
@@ -1867,6 +1870,7 @@ extern void tcp_rack_mark_lost(struct sock *sk, const 
struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, 

[net-next 13/13] tcp: disable fack by default

2017-01-12 Thread Yuchung Cheng
This patch disables FACK by default as RACK is the successor of FACK
(inspired by the insights behind FACK).

FACK[1] in Linux works as follows: a packet P is deemed lost,
if packet Q of higher sequence is s/acked and P and Q are distant
by at least dupthresh number of packets in sequence space.

FACK is more aggressive than the IETF recommended recovery for SACK
(RFC3517 A Conservative Selective Acknowledgment (SACK)-based Loss
 Recovery Algorithm for TCP), because a single SACK may trigger
fast recovery. This obviously won't work well with reordering so
FACK is dynamically disabled upon detecting reordering.

RACK supersedes FACK by using time distance instead of sequence
distance. On reordering, RACK waits for a quarter of RTT after receiving
a single SACK before starting recovery. (the timer can be made more
adaptive in the future by measuring reordering distance in time,
but currently RTT/4 seems to work well.) Once the recovery starts,
RACK behaves almost like FACK because it reduces the reordering
window to 1ms, so it fast retransmits quickly. In addition RACK
can detect loss retransmission as it does not care about the packet
sequences (being repeated or not), which is extremely useful when
the connection is going through a traffic policer.

Google server experiments indicate that disabling FACK after enabling
RACK has negligible impact on the overall loss recovery performance
with more reordering events detected.  But we still keep the FACK
implementation as a backup in case RACK has bugs and needs to be disabled.

[1] M. Mathis, J. Mahdavi, "Forward Acknowledgment: Refining
TCP Congestion Control," In Proceedings of SIGCOMM '96, August 1996.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 39ebc20ca1b2..1a34e9278c07 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
@@ -2114,7 +2114,8 @@ static inline int tcp_dupack_heuristics(const struct 
tcp_sock *tp)
  * dynamically measured and adjusted. This is implemented in
  * tcp_rack_mark_lost.
  *
- * FACK: it is the simplest heuristics. As soon as we decided
+ * FACK (Disabled by default. Subsumbed by RACK):
+ * It is the simplest heuristics. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
-- 
2.11.0.483.g087da7b7c-goog



[net-next 01/13] tcp: new helper function for RACK loss detection

2017-01-12 Thread Yuchung Cheng
Create a new helper tcp_rack_mark_skb_lost to prepare the
upcoming RACK reordering timer support.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_recovery.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..f38dba5aed7a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -3,6 +3,19 @@
 
 int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
 
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   tcp_skb_mark_lost_uncond_verify(tp, skb);
+   if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+   /* Account for retransmits that are lost again */
+   TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+   tp->retrans_out -= tcp_skb_pcount(skb);
+   NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+   }
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -61,13 +74,7 @@ int tcp_rack_mark_lost(struct sock *sk)
continue;
 
/* skb is lost if packet sent later is sacked */
-   tcp_skb_mark_lost_uncond_verify(tp, skb);
-   if (scb->sacked & TCPCB_SACKED_RETRANS) {
-   scb->sacked &= ~TCPCB_SACKED_RETRANS;
-   tp->retrans_out -= tcp_skb_pcount(skb);
-   NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
-   }
+   tcp_rack_mark_skb_lost(sk, skb);
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
 * b/c the rest are all sent after rack_sent
-- 
2.11.0.483.g087da7b7c-goog



[net-next 09/13] tcp: remove forward retransmit feature

2017-01-12 Thread Yuchung Cheng
Forward retransmit is an esoteric feature in RFC3517 (condition(3)
in the NextSeg()). Basically if a packet is not considered lost by
the current criteria (# of dupacks etc), but the congestion window
has room for more packets, then retransmit this packet.

However it actually conflicts with the rest of recovery design. For
example, when reordering is detected we want to be conservative
in retransmitting packets but forward-retransmit feature would
break that to force more retransmission. Also the implementation is
fairly complicated inside the retransmission logic inducing extra
iterations in the write queue. With RACK losses are being detected
timely and this heuristic is no longer necessary. Therefore this patch
removes the feature.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp_input.c  |  5 -
 net/ipv4/tcp_output.c | 61 +++
 3 files changed, 3 insertions(+), 64 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 970d5f00589f..8e5f4c15d0e5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -307,7 +307,6 @@ struct tcp_sock {
 */
 
int lost_cnt_hint;
-   u32 retransmit_high;/* L-bits may be on up to this seqno */
 
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq;   /* snd_nxt at onset of congestion   */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9469ce384d3b..a041a92348ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock 
*tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq,
   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
-
-   if (!tp->lost_out ||
-   after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-   tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-   tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
}
tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ba9026cb70d..6327e4d368a4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff 
*skb, int segs)
return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-   const struct inet_connection_sock *icsk = inet_csk(sk);
-   const struct tcp_sock *tp = tcp_sk(sk);
-
-   /* Forward retransmissions are possible only during Recovery. */
-   if (icsk->icsk_ca_state != TCP_CA_Recovery)
-   return false;
-
-   /* No forward retransmissions in Reno are possible. */
-   if (tcp_is_reno(tp))
-   return false;
-
-   /* Yeah, we have to make difficult choice between forward transmission
-* and retransmission... Both ways have their merits...
-*
-* For now we do not retransmit anything, while we have some new
-* segments to send. In the other cases, follow rule 3 for
-* NextSeg() specified in RFC3517.
-*/
-
-   if (tcp_may_send_now(sk))
-   return false;
-
-   return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
-   u32 max_segs, last_lost;
+   u32 max_segs;
int mib_idx;
-   int fwd_rexmitting = 0;
 
if (!tp->packets_out)
return;
 
-   if (!tp->lost_out)
-   tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
-   last_lost = TCP_SKB_CB(skb)->end_seq;
-   if (after(last_lost, tp->retransmit_high))
-   last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
-   last_lost = tp->snd_una;
}
 
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 

[net-next 10/13] tcp: remove early retransmit

2017-01-12 Thread Yuchung Cheng
This patch removes the support of RFC5827 early retransmit (i.e.,
fast recovery on small inflight with <3 dupacks) because it is
subsumed by the new RACK loss detection. More specifically when
RACK receives DUPACKs, it'll arm a reordering timer to start fast
recovery after a quarter of (min)RTT, hence it covers the early
retransmit except RACK does not limit itself to specific inflight
or dupack numbers.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt | 19 +++
 include/linux/tcp.h|  3 +-
 include/net/tcp.h  | 19 ---
 net/ipv4/inet_diag.c   |  1 -
 net/ipv4/tcp.c |  3 --
 net/ipv4/tcp_input.c   | 60 ++
 net/ipv4/tcp_ipv4.c|  1 -
 net/ipv4/tcp_metrics.c |  1 -
 net/ipv4/tcp_minisocks.c   |  1 -
 net/ipv4/tcp_output.c  | 11 +++
 net/ipv4/tcp_timer.c   |  3 --
 net/ipv6/tcp_ipv6.c|  1 -
 12 files changed, 12 insertions(+), 111 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 7dd65c9cf707..7de2cf79e16f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
Allows TCP to send "duplicate" SACKs.
 
 tcp_early_retrans - INTEGER
-   Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
-   for triggering fast retransmit when the amount of outstanding data is
-   small and when no previously unsent data can be transmitted (such
-   that limited transmit could be used). Also controls the use of
-   Tail loss probe (TLP) that converts RTOs occurring due to tail
-   losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
+   Tail loss probe (TLP) converts RTOs occurring due to tail
+   losses into fast recovery (draft-ietf-tcpm-rack). Note that
+   TLP requires RACK to function properly (see tcp_recovery below)
Possible values:
-   0 disables ER
-   1 enables ER
-   2 enables ER but delays fast recovery and fast retransmit
- by a fourth of RTT. This mitigates connection falsely
- recovers when network has a small degree of reordering
- (less than 3 packets).
-   3 enables delayed ER and TLP.
-   4 enables TLP only.
+   0 disables TLP
+   3 or 4 enables TLP
Default: 3
 
 tcp_ecn - INTEGER
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8e5f4c15d0e5..4733368f953a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,8 +224,7 @@ struct tcp_sock {
repair  : 1,
frto: 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8  repair_queue;
-   u8  do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-   syn_data:1, /* SYN includes data */
+   u8  syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 423438dd6fe9..c55d65f74f7f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 const struct sk_buff *next_skb);
 
 /* tcp_input.c */
-void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
@@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
-/* TCP early-retransmit (ER) is similar to but more conservative than
- * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
- */
-static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
-{
-   struct net *net = sock_net((struct sock *)tp);
-
-   tp->do_early_retrans = sysctl_tcp_early_retrans &&
-   sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-   !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
-   net->ipv4.sysctl_tcp_reordering == 3;
-}
-
-static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
-{
-   tp->do_early_retrans = 0;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index d216e40623d3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ 

[net-next 03/13] tcp: record most recent RTT in RACK loss detection

2017-01-12 Thread Yuchung Cheng
Record the most recent RTT in RACK. It is often identical to the
"ca_rtt_us" values in tcp_clean_rtx_queue. But when the packet has
been retransmitted, RACK choses to believe the ACK is for the
(latest) retransmitted packet if the RTT is over minimum RTT.

This requires passing the arrival time of the most recent ACK to
RACK routines. The timestamp is now recorded in the "ack_time"
in tcp_sacktag_state during the ACK processing.

This patch does not change the RACK algorithm itself. It only adds
the RTT variable to prepare the next main patch.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Acked-by: Eric Dumazet 
---
 include/linux/tcp.h |  1 +
 include/net/tcp.h   |  7 ---
 net/ipv4/tcp_input.c| 36 ++--
 net/ipv4/tcp_recovery.c | 41 +++--
 4 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fc5848dad7a4..1255c592719c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -207,6 +207,7 @@ struct tcp_sock {
/* Information of the most recently (s)acked skb */
struct tcp_rack {
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+   u32 rtt_us;  /* Associated RTT */
u8 advanced; /* mstamp advanced since last lost marking */
u8 reord;/* reordering detected */
} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51183bba3835..1439107658c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,9 +1863,10 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern void tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp,
-const struct skb_mstamp *xmit_time, u8 sacked);
+extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+const struct skb_mstamp *xmit_time,
+const struct skb_mstamp *ack_time);
 
 /*
  * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb24b93e64bc..8ccd171999bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1135,6 +1135,7 @@ struct tcp_sacktag_state {
 */
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+   struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
 };
@@ -1217,7 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
 
if (!(sacked & TCPCB_SACKED_ACKED)) {
-   tcp_rack_advance(tp, xmit_time, sacked);
+   tcp_rack_advance(tp, sacked, xmit_time, >ack_time);
 
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -2813,7 +2814,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit,
+ const struct skb_mstamp *ack_time)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2868,7 +2870,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
u32 prior_retrans = tp->retrans_out;
 
-   tcp_rack_mark_lost(sk);
+   tcp_rack_mark_lost(sk, ack_time);
if (prior_retrans > tp->retrans_out) {
flag |= FLAG_LOST_RETRANS;
*ack_flag |= FLAG_LOST_RETRANS;
@@ -3105,11 +3107,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct 
sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
   u32 prior_snd_una, int *acked,
-  struct tcp_sacktag_state *sack,
-  struct skb_mstamp *now)
+  struct tcp_sacktag_state *sack)
 {
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
+   struct skb_mstamp *now = >ack_time;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3169,7 +3171,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int 
prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
   

[net-next 00/13] RACK fast recovery

2017-01-12 Thread Yuchung Cheng
The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
to trigger fast recovery with a reordering timer.

Previously RACK has been running in auxiliary mode where it is
used to detect packet losses once the recovery has triggered by
other algorithms (e.g., FACK). By inspecting packet timestamps,
RACK can start ACK-driven repairs timely. A few similar heuristics
are no longer needed and are either removed or disabled to reduce
the complexity of the Linux TCP loss recovery engine:

  1. FACK (Forward Acknowledgement)
  2. Early Retransmit (RFC5827)
  3. thin_dupack (fast recovery on single DUPACK for thin-streams)
  4. NCR (Non-Congestion Robustness RFC4653) (RFC4653)
  5. Forward Retransmit

After this change, Linux's loss recovery algorithms consist of
  1. Conventional DUPACK threshold approach (RFC6675)
  2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
  3. RTO plus F-RTO extension (RFC5682)

The patch set has been tested on Google servers extensively and
presented in several IETF meetings. The data suggests that RACK
successfully improves recovery performance:
https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf


Yuchung Cheng (13):
  tcp: new helper function for RACK loss detection
  tcp: new helper for RACK to detect loss
  tcp: record most recent RTT in RACK loss detection
  tcp: add reordering timer in RACK loss detection
  tcp: use sequence to break TS ties for RACK loss detection
  tcp: check undo conditions before detecting losses
  tcp: enable RACK loss detection to trigger recovery
  tcp: extend F-RTO to catch more spurious timeouts
  tcp: remove forward retransmit feature
  tcp: remove early retransmit
  tcp: remove RFC4653 NCR
  tcp: remove thin_dupack feature
  tcp: disable fack by default

 Documentation/networking/ip-sysctl.txt |  31 +
 include/linux/tcp.h|   8 +-
 include/net/inet_connection_sock.h |   4 +-
 include/net/tcp.h  |  40 ++
 net/ipv4/inet_diag.c   |   2 +-
 net/ipv4/sysctl_net_ipv4.c |   7 --
 net/ipv4/tcp.c |   9 +-
 net/ipv4/tcp_input.c   | 224 +
 net/ipv4/tcp_ipv4.c|   2 +-
 net/ipv4/tcp_metrics.c |   1 -
 net/ipv4/tcp_minisocks.c   |   1 -
 net/ipv4/tcp_output.c  |  75 ++-
 net/ipv4/tcp_recovery.c| 148 --
 net/ipv4/tcp_timer.c   |   4 +-
 net/ipv6/tcp_ipv6.c|   2 +-
 15 files changed, 237 insertions(+), 321 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread Joe Perches
On Thu, 2017-01-12 at 21:24 -0800, David VomLehn wrote:
> On 01/12/2017 09:06 PM, Joe Perches wrote:
> > On Thu, 2017-01-12 at 21:02 -0800, Alexander Loktionov wrote:
> > > From: David VomLehn 
> > > 
> > > Patches to create the make and configuration files.
> > 
> > This patch should _really_ be the last in the series
> > not the first.
> > 
> 
> Could you explain the basis for this? By convention, we put tables of 
> content at the beginning of books and only indices at the back. 
> Analogously, make and config files can be used to establish the 
> context for what follows, making it easier to understand. Once 
> committed, of course, the order no longer matters except as bisection is 
> concerned.

As I wrote the first time:

On Tue, 2016-12-27 at 08:15 -0800, Joe Perches wrote:
> On Tue, 2016-12-27 at 05:17 -0800, David VomLehn wrote:
> > Patches to create the make and configuration files.
[]
> Patch 1 will not build if CONFIG_AQTION is enabled.
> Patch 1/12 should be reordered to be patch 12/12 and
> all the other patches moved up appropriately. 

You don't create the files until later patches.

If you applied just this first patch and tried to
add CONFIG_AQTION=y to the .config, make fails.

That's bad for git bisect.
Every patch in this series should build properly.

If you delay the adding of the Makefile and Kconfig
until all the files are added, then it'd bisect fine.


Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread David VomLehn

On 01/12/2017 09:06 PM, Joe Perches wrote:

On Thu, 2017-01-12 at 21:02 -0800, Alexander Loktionov wrote:

From: David VomLehn 

Patches to create the make and configuration files.

This patch should _really_ be the last in the series
not the first.

Could you explain the basis for this? By convention, we put tables of 
content at the beginning of books and only indices at the back. 
Analogously, make and config files can be used to establish the 
context for what follows, making it easier to understand. Once 
committed, of course, the order no longer matters except as bisection is 
concerned.


--
David VL



Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread Joe Perches
On Thu, 2017-01-12 at 21:02 -0800, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> Patches to create the make and configuration files.

This patch should _really_ be the last in the series
not the first.



[PATCH v5 03/13] net: ethernet: aquantia: Add ring support code

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add code to support the transmit and receive ring buffers.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_ring.c | 358 
 drivers/net/ethernet/aquantia/aq_ring.h | 157 ++
 2 files changed, 515 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_ring.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_ring.h

diff --git a/drivers/net/ethernet/aquantia/aq_ring.c 
b/drivers/net/ethernet/aquantia/aq_ring.c
new file mode 100644
index 000..690c37d
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_ring.c
@@ -0,0 +1,358 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_ring.c: Definition of functions for Rx/Tx rings. */
+
+#include "aq_ring.h"
+#include "aq_nic.h"
+#include "aq_hw.h"
+
+#include 
+#include 
+
+static struct aq_ring_s *aq_ring_alloc(struct aq_ring_s *self,
+  struct aq_nic_s *aq_nic)
+{
+   int err = 0;
+
+   self->buff_ring = (struct aq_ring_buff_s *)
+   kzalloc(sizeof(struct aq_ring_buff_s) * self->size, GFP_KERNEL);
+
+   if (!self->buff_ring) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+   self->dx_ring = dma_alloc_coherent(aq_nic_get_dev(aq_nic),
+   self->size * self->dx_size,
+   >dx_ring_pa, GFP_KERNEL);
+   if (!self->dx_ring) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+
+err_exit:
+   if (err < 0) {
+   aq_ring_free(self);
+   self = NULL;
+   }
+   return self;
+}
+
+struct aq_ring_s *aq_ring_tx_alloc(struct aq_ring_s *self,
+  struct aq_nic_s *aq_nic,
+  unsigned int idx,
+  struct aq_nic_cfg_s *aq_nic_cfg)
+{
+   int err = 0;
+
+   self->aq_nic = aq_nic;
+   self->idx = idx;
+   self->size = aq_nic_cfg->txds;
+   self->dx_size = aq_nic_cfg->aq_hw_caps->txd_size;
+
+   self = aq_ring_alloc(self, aq_nic);
+   if (!self) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+
+err_exit:
+   if (err < 0) {
+   aq_ring_free(self);
+   self = NULL;
+   }
+   return self;
+}
+
+struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
+  struct aq_nic_s *aq_nic,
+  unsigned int idx,
+  struct aq_nic_cfg_s *aq_nic_cfg)
+{
+   int err = 0;
+
+   self->aq_nic = aq_nic;
+   self->idx = idx;
+   self->size = aq_nic_cfg->rxds;
+   self->dx_size = aq_nic_cfg->aq_hw_caps->rxd_size;
+
+   self = aq_ring_alloc(self, aq_nic);
+   if (!self) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+
+err_exit:
+   if (err < 0) {
+   aq_ring_free(self);
+   self = NULL;
+   }
+   return self;
+}
+
+void aq_ring_init(struct aq_ring_s *self)
+{
+   self->hw_head = 0;
+   self->sw_head = 0;
+   self->sw_tail = 0;
+}
+
+void aq_ring_free(struct aq_ring_s *self)
+{
+   if (!self)
+   return;
+
+   kfree(self->buff_ring);
+
+   if (self->dx_ring)
+   dma_free_coherent(aq_nic_get_dev(self->aq_nic),
+ self->size * self->dx_size, self->dx_ring,
+ self->dx_ring_pa);
+}
+
+void aq_ring_tx_append_buffs(struct aq_ring_s *self,
+struct aq_ring_buff_s *buffer,
+unsigned int buffers)
+{
+   if (likely(self->sw_tail + buffers < self->size)) {
+   memcpy(>buff_ring[self->sw_tail], buffer,
+  sizeof(buffer[0]) * buffers);
+   } else {
+   unsigned int first_part = self->size - self->sw_tail;
+   unsigned int second_part = buffers - first_part;
+
+   memcpy(>buff_ring[self->sw_tail], buffer,
+  sizeof(buffer[0]) * first_part);
+
+   memcpy(>buff_ring[0], [first_part],
+  sizeof(buffer[0]) * second_part);
+   }
+}
+
+void aq_ring_tx_clean(struct aq_ring_s *self)
+{
+   struct device *dev = aq_nic_get_dev(self->aq_nic);
+
+   for (; 

[PATCH v5 02/13] net: ethernet: aquantia: Common functions and definitions

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add files containing the functions and definitions used in common in
different functional areas.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_cfg.h| 77 +++
 drivers/net/ethernet/aquantia/aq_common.h | 23 +
 drivers/net/ethernet/aquantia/aq_utils.h  | 53 +
 3 files changed, 153 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_cfg.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_common.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_utils.h

diff --git a/drivers/net/ethernet/aquantia/aq_cfg.h 
b/drivers/net/ethernet/aquantia/aq_cfg.h
new file mode 100644
index 000..5f99237
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_cfg.h
@@ -0,0 +1,77 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_cfg.h: Definition of configuration parameters and constants. */
+
+#ifndef AQ_CFG_H
+#define AQ_CFG_H
+
+#define AQ_CFG_VECS_DEF   4U
+#define AQ_CFG_TCS_DEF1U
+
+#define AQ_CFG_TXDS_DEF4096U
+#define AQ_CFG_RXDS_DEF1024U
+
+#define AQ_CFG_IS_POLLING_DEF 0U
+
+#define AQ_CFG_FORCE_LEGACY_INT 0U
+
+#define AQ_CFG_IS_INTERRUPT_MODERATION_DEF   1U
+#define AQ_CFG_INTERRUPT_MODERATION_RATE_DEF 0xU
+#define AQ_CFG_IRQ_MASK  0x1FFU
+
+#define AQ_CFG_VECS_MAX   8U
+#define AQ_CFG_TCS_MAX8U
+
+#define AQ_CFG_TX_FRAME_MAX  (16U * 1024U)
+#define AQ_CFG_RX_FRAME_MAX  (4U * 1024U)
+
+/* LRO */
+#define AQ_CFG_IS_LRO_DEF   1U
+
+/* RSS */
+#define AQ_CFG_RSS_INDIRECTION_TABLE_MAX  128U
+#define AQ_CFG_RSS_HASHKEY_SIZE   320U
+
+#define AQ_CFG_IS_RSS_DEF   1U
+#define AQ_CFG_NUM_RSS_QUEUES_DEF   AQ_CFG_VECS_DEF
+#define AQ_CFG_RSS_BASE_CPU_NUM_DEF 0U
+
+#define AQ_CFG_PCI_FUNC_MSIX_IRQS   9U
+#define AQ_CFG_PCI_FUNC_PORTS   2U
+
+#define AQ_CFG_SERVICE_TIMER_INTERVAL(2 * HZ)
+#define AQ_CFG_POLLING_TIMER_INTERVAL   ((unsigned int)(2 * HZ))
+
+#define AQ_CFG_SKB_FRAGS_MAX   32U
+
+#define AQ_CFG_NAPI_WEIGHT 64U
+
+#define AQ_CFG_MULTICAST_ADDRESS_MAX 32U
+
+/*#define AQ_CFG_MAC_ADDR_PERMANENT {0x30, 0x0E, 0xE3, 0x12, 0x34, 0x56}*/
+
+#define AQ_CFG_FC_MODE 3U
+
+#define AQ_CFG_SPEED_MSK  0xU  /* 0xU==auto_neg */
+
+#define AQ_CFG_IS_AUTONEG_DEF   1U
+#define AQ_CFG_MTU_DEF  1514U
+
+#define AQ_CFG_LOCK_TRYS   100U
+
+#define AQ_CFG_DRV_AUTHOR  "aQuantia"
+#define AQ_CFG_DRV_DESC"aQuantia Corporation(R) Network Driver"
+#define AQ_CFG_DRV_NAME"aquantia"
+#define AQ_CFG_DRV_VERSION __stringify(NIC_MAJOR_DRIVER_VERSION)"."\
+   __stringify(NIC_MINOR_DRIVER_VERSION)"."\
+   __stringify(NIC_BUILD_DRIVER_VERSION)"."\
+   __stringify(NIC_REVISION_DRIVER_VERSION)
+
+#endif /* AQ_CFG_H */
diff --git a/drivers/net/ethernet/aquantia/aq_common.h 
b/drivers/net/ethernet/aquantia/aq_common.h
new file mode 100644
index 000..9eb5e22
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_common.h
@@ -0,0 +1,23 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_common.h: Basic includes for all files in project. */
+
+#ifndef AQ_COMMON_H
+#define AQ_COMMON_H
+
+#include 
+#include 
+
+#include "ver.h"
+#include "aq_nic.h"
+#include "aq_cfg.h"
+#include "aq_utils.h"
+
+#endif /* AQ_COMMON_H */
diff --git a/drivers/net/ethernet/aquantia/aq_utils.h 
b/drivers/net/ethernet/aquantia/aq_utils.h
new file mode 100644
index 000..2ffc0f4
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_utils.h
@@ -0,0 +1,53 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_utils.h: Useful macro and structures used in all layers of driver. 
*/
+
+#ifndef AQ_UTILS_H
+#define AQ_UTILS_H
+
+#include "aq_common.h"
+
+#ifndef MBIT
+#define MBIT ((u64)100U)

[PATCH v5 07/13] net: ethernet: aquantia: Vector operations

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add functions to manipulate the vector of receive and transmit rings.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel.Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_vec.c | 385 +
 drivers/net/ethernet/aquantia/aq_vec.h |  42 
 2 files changed, 427 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_vec.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_vec.h

diff --git a/drivers/net/ethernet/aquantia/aq_vec.c 
b/drivers/net/ethernet/aquantia/aq_vec.c
new file mode 100644
index 000..7974b7c0
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_vec.c
@@ -0,0 +1,385 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_vec.c: Definition of common structure for vector of Rx and Tx rings.
+ * Definition of functions for Rx and Tx rings. Friendly module for aq_nic.
+ */
+
+#include "aq_vec.h"
+#include "aq_nic.h"
+#include "aq_ring.h"
+#include "aq_hw.h"
+
+#include 
+
+struct aq_vec_s {
+   AQ_OBJ_HEADER;
+   struct aq_hw_ops *aq_hw_ops;
+   struct aq_hw_s *aq_hw;
+   struct aq_nic_s *aq_nic;
+   unsigned int tx_rings;
+   unsigned int rx_rings;
+   struct aq_ring_param_s aq_ring_param;
+   struct napi_struct napi;
+   struct aq_ring_s ring[AQ_CFG_TCS_MAX][2];
+};
+
+#define AQ_VEC_TX_ID 0
+#define AQ_VEC_RX_ID 1
+
+static int aq_vec_poll(struct napi_struct *napi, int budget)
+__releases(>lock)
+__acquires(>lock)
+{
+   struct aq_vec_s *self = container_of(napi, struct aq_vec_s, napi);
+   struct aq_ring_s *ring = NULL;
+   int work_done = 0;
+   int err = 0;
+   unsigned int i = 0U;
+   unsigned int sw_tail_old = 0U;
+   bool was_tx_cleaned = false;
+   bool is_locked = false;
+
+   if (!self)
+   return 0;
+
+   if (spin_trylock(>lock)) {
+   is_locked = true;
+
+   for (i = 0U, ring = self->ring[0];
+   self->tx_rings > i; ++i, ring = self->ring[i]) {
+   if (self->aq_hw_ops->hw_ring_tx_head_update) {
+   err = self->aq_hw_ops->hw_ring_tx_head_update(
+   self->aq_hw,
+   [AQ_VEC_TX_ID]);
+   if (err < 0)
+   goto err_exit;
+   }
+
+   if (ring[AQ_VEC_TX_ID].sw_head !=
+   ring[AQ_VEC_TX_ID].hw_head) {
+   aq_ring_tx_clean([AQ_VEC_TX_ID]);
+   was_tx_cleaned = true;
+   }
+
+   err = self->aq_hw_ops->hw_ring_rx_receive(self->aq_hw,
+   [AQ_VEC_RX_ID]);
+   if (err < 0)
+   goto err_exit;
+
+   if (ring[AQ_VEC_RX_ID].sw_head !=
+   ring[AQ_VEC_RX_ID].hw_head) {
+   err = aq_ring_rx_clean([AQ_VEC_RX_ID],
+  _done,
+  budget - work_done);
+   if (err < 0)
+   goto err_exit;
+
+   sw_tail_old = ring[AQ_VEC_RX_ID].sw_tail;
+
+   err = aq_ring_rx_fill([AQ_VEC_RX_ID]);
+   if (err < 0)
+   goto err_exit;
+
+   err = self->aq_hw_ops->hw_ring_rx_fill(
+   self->aq_hw,
+   [AQ_VEC_RX_ID], sw_tail_old);
+   if (err < 0)
+   goto err_exit;
+   }
+   }
+
+   if (was_tx_cleaned)
+   work_done = budget;
+
+   if (work_done < budget) {
+   napi_complete(napi);
+   self->aq_hw_ops->hw_irq_enable(self->aq_hw,
+   1U << self->aq_ring_param.vec_idx);
+   }
+
+err_exit:
+   if (is_locked)
+   spin_unlock(>lock);
+   }
+
+   return work_done;
+}
+
+struct aq_vec_s *aq_vec_alloc(struct 

[PATCH v5 00/13] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

v1: Initial version
v2: o Make necessary drivers/net/ethernet changes to integrate software
o Drop intermediate atlantic directory
o Remove Makefile things only appropriate to out of tree module
  building
v3: o Move changes to drivers/net/ethernet/{Kconfig,Makefile} to the last
  patch to ensure clean bisection.
o Removed inline attribute aq_hw_write_req() as it was defined in
  only one .c file.
o #included pci.h in aq_common.h to get struct pci definition.
o Modified code to unlock based execution flow rather than using a
  flag.
o Made a number of functions that were only used in a single file
  static.
o Cleaned up error and return code handling in various places.
o Remove AQ_CFG_IP_ALIGN definition.
o Other minor code clean up.
v4: o Using do_div for 64 bit division.
o Modified NIC statistics code.
o Using build_skb instead netdev_alloc_skb for single fragment
  packets.
o Removed extra aq_nic.o from Makefile
v5: o Removed extra newline at the end of the files.
o Wrapped cover letter lines.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: David M. VomLehn 

David VomLehn (13):
  net: ethernet: aquantia: Make and configuration files.
  net: ethernet: aquantia: Common functions and definitions
  net: ethernet: aquantia: Add ring support code
  net: ethernet: aquantia: Low-level hardware interfaces
  net: ethernet: aquantia: Support for NIC-specific code
  net: ethernet: aquantia: Atlantic A0 and B0 specific functions.
  net: ethernet: aquantia: Vector operations
  net: ethernet: aquantia: PCI operations
  net: ethernet: aquantia: Atlantic hardware abstraction layer
  net: ethernet: aquantia: Hardware interface and utility functions
  net: ethernet: aquantia: Ethtool support
  net: ethernet: aquantia: Receive side scaling
  net: ethernet: aquantia: Integrate AQtion 2.5/5 GB NIC driver

 drivers/net/ethernet/Kconfig   |1 +
 drivers/net/ethernet/Makefile  |1 +
 drivers/net/ethernet/aquantia/Kconfig  |   24 +
 drivers/net/ethernet/aquantia/Makefile |   42 +
 drivers/net/ethernet/aquantia/aq_cfg.h |   77 +
 drivers/net/ethernet/aquantia/aq_common.h  |   23 +
 drivers/net/ethernet/aquantia/aq_ethtool.c |  250 +++
 drivers/net/ethernet/aquantia/aq_ethtool.h |   19 +
 drivers/net/ethernet/aquantia/aq_hw.h  |  169 ++
 drivers/net/ethernet/aquantia/aq_hw_utils.c|   68 +
 drivers/net/ethernet/aquantia/aq_hw_utils.h|   47 +
 drivers/net/ethernet/aquantia/aq_main.c|  291 +++
 drivers/net/ethernet/aquantia/aq_main.h|   17 +
 drivers/net/ethernet/aquantia/aq_nic.c |  910 
 drivers/net/ethernet/aquantia/aq_nic.h |  108 +
 drivers/net/ethernet/aquantia/aq_nic_internal.h|   46 +
 drivers/net/ethernet/aquantia/aq_pci_func.c|  347 +++
 drivers/net/ethernet/aquantia/aq_pci_func.h|   34 +
 drivers/net/ethernet/aquantia/aq_ring.c|  358 +++
 drivers/net/ethernet/aquantia/aq_ring.h|  157 ++
 drivers/net/ethernet/aquantia/aq_rss.h |   26 +
 drivers/net/ethernet/aquantia/aq_utils.h   |   53 +
 drivers/net/ethernet/aquantia/aq_vec.c |  385 
 drivers/net/ethernet/aquantia/aq_vec.h |   42 +
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c   |  907 
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.h   |   34 +
 .../ethernet/aquantia/hw_atl/hw_atl_a0_internal.h  |  152 ++
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.c   |  960 
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.h   |   34 +
 .../ethernet/aquantia/hw_atl/hw_atl_b0_internal.h  |  205 ++
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_llh.c  | 1394 
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_llh.h  |  677 ++
 .../ethernet/aquantia/hw_atl/hw_atl_llh_internal.h | 2375 
 .../net/ethernet/aquantia/hw_atl/hw_atl_utils.c|  547 +
 .../net/ethernet/aquantia/hw_atl/hw_atl_utils.h|  210 ++
 drivers/net/ethernet/aquantia/ver.h|   18 +
 36 files changed, 11008 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/Kconfig
 create mode 100644 drivers/net/ethernet/aquantia/Makefile
 create mode 100644 drivers/net/ethernet/aquantia/aq_cfg.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_common.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw_utils.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw_utils.h
 create mode 100644 

[PATCH v5 10/13] net: ethernet: aquantia: Hardware interface and utility functions

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add functions to interface with the hardware and some utility functions.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_hw.h   | 169 
 drivers/net/ethernet/aquantia/aq_hw_utils.c |  68 +++
 drivers/net/ethernet/aquantia/aq_hw_utils.h |  47 
 3 files changed, 284 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw_utils.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_hw_utils.h

diff --git a/drivers/net/ethernet/aquantia/aq_hw.h 
b/drivers/net/ethernet/aquantia/aq_hw.h
new file mode 100644
index 000..a3c727e
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_hw.h
@@ -0,0 +1,169 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_hw.h: Declaraion of abstract interface for NIC hardware specific
+ * functions.
+ */
+
+#ifndef AQ_HW_H
+#define AQ_HW_H
+
+#include "aq_common.h"
+
+/* NIC H/W capabilities */
+struct aq_hw_caps_s {
+   u64 hw_features;
+   u64 link_speed_msk;
+   unsigned int hw_priv_flags;
+   u32 rxds;
+   u32 txds;
+   u32 txhwb_alignment;
+   u32 irq_mask;
+   u32 vecs;
+   u32 mtu;
+   u32 mac_regs_count;
+   u8 ports;
+   u8 msix_irqs;
+   u8 tcs;
+   u8 rxd_alignment;
+   u8 rxd_size;
+   u8 txd_alignment;
+   u8 txd_size;
+   u8 tx_rings;
+   u8 rx_rings;
+   bool flow_control;
+   bool is_64_dma;
+};
+
+struct aq_hw_link_status_s {
+   u64 bps;
+};
+
+#define AQ_HW_POWER_STATE_D0   0U
+#define AQ_HW_POWER_STATE_D3   3U
+
+#define AQ_HW_FLAG_STARTED 0x0004U
+#define AQ_HW_FLAG_STOPPING0x0008U
+#define AQ_HW_FLAG_RESETTING   0x0010U
+#define AQ_HW_FLAG_CLOSING 0x0020U
+#define AQ_HW_LINK_DOWN0x0400U
+#define AQ_HW_FLAG_ERR_UNPLUG  0x4000U
+#define AQ_HW_FLAG_ERR_HW  0x8000U
+
+#define AQ_HW_FLAG_ERRORS  (AQ_HW_FLAG_ERR_HW | AQ_HW_FLAG_ERR_UNPLUG)
+
+struct aq_hw_s {
+   AQ_OBJ_HEADER;
+   struct aq_nic_cfg_s *aq_nic_cfg;
+   struct aq_pci_func_s *aq_pci_func;
+   void __iomem *mmio;
+   unsigned int not_ff_addr;
+   struct aq_hw_link_status_s aq_link_status;
+};
+
+struct aq_ring_s;
+struct aq_ring_param_s;
+struct aq_nic_cfg_s;
+struct sk_buff;
+
+struct aq_hw_ops {
+   struct aq_hw_s *(*create)(struct aq_pci_func_s *aq_pci_func,
+ unsigned int port, struct aq_hw_ops *ops);
+
+   void (*destroy)(struct aq_hw_s *self);
+
+   int (*get_hw_caps)(struct aq_hw_s *self,
+  struct aq_hw_caps_s *aq_hw_caps);
+
+   int (*hw_ring_tx_xmit)(struct aq_hw_s *self, struct aq_ring_s *aq_ring,
+  unsigned int frags);
+
+   int (*hw_ring_rx_receive)(struct aq_hw_s *self,
+ struct aq_ring_s *aq_ring);
+
+   int (*hw_ring_rx_fill)(struct aq_hw_s *self, struct aq_ring_s *aq_ring,
+  unsigned int sw_tail_old);
+
+   int (*hw_ring_tx_head_update)(struct aq_hw_s *self,
+ struct aq_ring_s *aq_ring);
+
+   int (*hw_get_mac_permanent)(struct aq_hw_s *self, u8 *mac);
+
+   int (*hw_set_mac_address)(struct aq_hw_s *self, u8 *mac_addr);
+
+   int (*hw_get_link_status)(struct aq_hw_s *self,
+ struct aq_hw_link_status_s *link_status);
+
+   int (*hw_set_link_speed)(struct aq_hw_s *self, u32 speed);
+
+   int (*hw_reset)(struct aq_hw_s *self);
+
+   int (*hw_init)(struct aq_hw_s *self, struct aq_nic_cfg_s *aq_nic_cfg,
+  u8 *mac_addr);
+
+   int (*hw_start)(struct aq_hw_s *self);
+
+   int (*hw_stop)(struct aq_hw_s *self);
+
+   int (*hw_ring_tx_init)(struct aq_hw_s *self, struct aq_ring_s *aq_ring,
+  struct aq_ring_param_s *aq_ring_param);
+
+   int (*hw_ring_tx_start)(struct aq_hw_s *self,
+   struct aq_ring_s *aq_ring);
+
+   int (*hw_ring_tx_stop)(struct aq_hw_s *self,
+  struct aq_ring_s *aq_ring);
+
+   int (*hw_ring_rx_init)(struct aq_hw_s *self,
+  struct aq_ring_s *aq_ring,
+  struct aq_ring_param_s *aq_ring_param);
+
+   int 

[PATCH v5 09/13] net: ethernet: aquantia: Atlantic hardware abstraction layer

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add common functions for Atlantic hardware abstraction layer.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 .../net/ethernet/aquantia/hw_atl/hw_atl_utils.c| 547 +
 .../net/ethernet/aquantia/hw_atl/hw_atl_utils.h| 210 
 2 files changed, 757 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_utils.c
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_utils.h

diff --git a/drivers/net/ethernet/aquantia/hw_atl/hw_atl_utils.c 
b/drivers/net/ethernet/aquantia/hw_atl/hw_atl_utils.c
new file mode 100644
index 000..bb24d64
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/hw_atl/hw_atl_utils.c
@@ -0,0 +1,547 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File hw_atl_utils.c: Definition of common functions for Atlantic hardware
+ * abstraction layer.
+ */
+
+#include "../aq_hw.h"
+#include "../aq_hw_utils.h"
+#include "../aq_pci_func.h"
+#include "../aq_ring.h"
+#include "../aq_vec.h"
+#include "hw_atl_utils.h"
+#include "hw_atl_llh.h"
+
+#include 
+
+#define HW_ATL_UCP_0X370_REG0x0370U
+
+#define HW_ATL_FW_SM_RAM0x2U
+#define HW_ATL_MPI_CONTROL_ADR  0x0368U
+#define HW_ATL_MPI_STATE_ADR0x036CU
+
+#define HW_ATL_MPI_STATE_MSK0x00FFU
+#define HW_ATL_MPI_STATE_SHIFT  0U
+#define HW_ATL_MPI_SPEED_MSK0xU
+#define HW_ATL_MPI_SPEED_SHIFT  16U
+
+static int hw_atl_utils_fw_downld_dwords(struct aq_hw_s *self, u32 a,
+u32 *p, u32 cnt)
+{
+   int err = 0;
+
+   AQ_HW_WAIT_FOR(reg_glb_cpu_sem_get(self,
+  HW_ATL_FW_SM_RAM) == 1U, 1U, 1000U);
+
+   if (err < 0) {
+   bool is_locked;
+
+   reg_glb_cpu_sem_set(self, 1U, HW_ATL_FW_SM_RAM);
+   is_locked = reg_glb_cpu_sem_get(self, HW_ATL_FW_SM_RAM);
+   if (!is_locked) {
+   err = -ETIME;
+   goto err_exit;
+   }
+   }
+
+   aq_hw_write_reg(self, 0x0208U, a);
+
+   for (++cnt; --cnt;) {
+   u32 i = 0U;
+
+   aq_hw_write_reg(self, 0x0200U, 0x8000U);
+
+   for (i = 1024U;
+   (0x100U & aq_hw_read_reg(self, 0x0200U)) && --i;) {
+   }
+
+   *(p++) = aq_hw_read_reg(self, 0x020CU);
+   }
+
+   reg_glb_cpu_sem_set(self, 1U, HW_ATL_FW_SM_RAM);
+
+err_exit:
+   return err;
+}
+
+static void hw_atl_utils_fw_upload_dwords(struct aq_hw_s *self, u32 a, u32 *p,
+ u32 cnt)
+{
+   int err = 0;
+   bool is_locked;
+
+   is_locked = reg_glb_cpu_sem_get(self, HW_ATL_FW_SM_RAM);
+   if (!is_locked) {
+   err = -ETIME;
+   goto err_exit;
+   }
+
+   aq_hw_write_reg(self, 0x0208U, a);
+
+   for (++cnt; --cnt;) {
+   u32 i = 0U;
+
+   aq_hw_write_reg(self, 0x020CU, *(p++));
+   aq_hw_write_reg(self, 0x0200U, 0xC000U);
+
+   for (i = 1024U;
+   (0x100U & aq_hw_read_reg(self, 0x0200U)) && --i;) {
+   }
+   }
+
+   reg_glb_cpu_sem_set(self, 1U, HW_ATL_FW_SM_RAM);
+
+err_exit:;
+   (void)err;
+}
+
+static int hw_atl_utils_init_ucp(struct aq_hw_s *self)
+{
+   int err = 0;
+
+   if (!aq_hw_read_reg(self, 0x370U)) {
+   unsigned int rnd = 0U;
+   unsigned int ucp_0x370 = 0U;
+
+   get_random_bytes(, sizeof(unsigned int));
+
+   ucp_0x370 = 0x02020202U | (0xFEFEFEFEU & rnd);
+   aq_hw_write_reg(self, HW_ATL_UCP_0X370_REG, ucp_0x370);
+   }
+
+   reg_glb_cpu_scratch_scp_set(self, 0xU, 25U);
+
+   /* check 10 times by 1ms */
+   AQ_HW_WAIT_FOR(0U != (PHAL_ATLANTIC_A0->mbox_addr =
+   aq_hw_read_reg(self, 0x360U)), 1000U, 10U);
+
+   return err;
+}
+
+#define HW_ATL_RPC_CONTROL_ADR 0x0338U
+#define HW_ATL_RPC_STATE_ADR   0x033CU
+
+struct aq_hw_atl_utils_fw_rpc_tid_s {
+   union {
+   u32 val;
+   struct {
+   u16 tid;
+   u16 len;
+   };
+   };
+};
+
+#define hw_atl_utils_fw_rpc_init(_H_) hw_atl_utils_fw_rpc_wait(_H_, NULL)
+
+static int hw_atl_utils_fw_rpc_call(struct aq_hw_s *self, unsigned int 
rpc_size)
+{
+  

[PATCH v5 08/13] net: ethernet: aquantia: PCI operations

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add functions that handle the PCI bus interface.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_pci_func.c | 347 
 drivers/net/ethernet/aquantia/aq_pci_func.h |  34 +++
 2 files changed, 381 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_pci_func.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_pci_func.h

diff --git a/drivers/net/ethernet/aquantia/aq_pci_func.c 
b/drivers/net/ethernet/aquantia/aq_pci_func.c
new file mode 100644
index 000..0b9052c
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_pci_func.c
@@ -0,0 +1,347 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_pci_func.c: Definition of PCI functions. */
+
+#include "aq_pci_func.h"
+#include "aq_nic.h"
+#include "aq_vec.h"
+#include "aq_hw.h"
+#include 
+
+struct aq_pci_func_s {
+   struct pci_dev *pdev;
+   struct aq_nic_s *port[AQ_CFG_PCI_FUNC_PORTS];
+   void __iomem *mmio;
+   void *aq_vec[AQ_CFG_PCI_FUNC_MSIX_IRQS];
+   resource_size_t mmio_pa;
+   unsigned int msix_entry_mask;
+   unsigned int irq_type;
+   unsigned int ports;
+   bool is_pci_enabled;
+   bool is_regions;
+   bool is_pci_using_dac;
+   struct aq_hw_caps_s aq_hw_caps;
+   struct msix_entry msix_entry[AQ_CFG_PCI_FUNC_MSIX_IRQS];
+};
+
+struct aq_pci_func_s *aq_pci_func_alloc(struct aq_hw_ops *aq_hw_ops,
+   struct pci_dev *pdev,
+   const struct net_device_ops *ndev_ops,
+   const struct ethtool_ops *eth_ops)
+{
+   struct aq_pci_func_s *self = NULL;
+   int err = 0;
+   unsigned int port = 0U;
+
+   if (!aq_hw_ops) {
+   err = -EFAULT;
+   goto err_exit;
+   }
+   self = kzalloc(sizeof(*self), GFP_KERNEL);
+   if (!self) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+
+   pci_set_drvdata(pdev, self);
+   self->pdev = pdev;
+
+   err = aq_hw_ops->get_hw_caps(NULL, >aq_hw_caps);
+   if (err < 0)
+   goto err_exit;
+
+   self->ports = self->aq_hw_caps.ports;
+
+   for (port = 0; port < self->ports; ++port) {
+   struct aq_nic_s *aq_nic = aq_nic_alloc_cold(ndev_ops, eth_ops,
+   >dev, self,
+   port, aq_hw_ops);
+
+   if (!aq_nic) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+   self->port[port] = aq_nic;
+   }
+
+err_exit:
+   if (err < 0) {
+   if (self)
+   aq_pci_func_free(self);
+   self = NULL;
+   }
+
+   (void)err;
+   return self;
+}
+
+int aq_pci_func_init(struct aq_pci_func_s *self)
+{
+   int err = 0;
+   unsigned int bar = 0U;
+   unsigned int port = 0U;
+   unsigned int i = 0U;
+
+   err = pci_enable_device(self->pdev);
+   if (err < 0)
+   goto err_exit;
+
+   self->is_pci_enabled = true;
+
+   err = pci_set_dma_mask(self->pdev, DMA_BIT_MASK(64));
+   if (!err) {
+   err = pci_set_consistent_dma_mask(self->pdev, DMA_BIT_MASK(64));
+   self->is_pci_using_dac = 1;
+   }
+   if (err) {
+   err = pci_set_dma_mask(self->pdev, DMA_BIT_MASK(32));
+   if (!err)
+   err = pci_set_consistent_dma_mask(self->pdev,
+ DMA_BIT_MASK(32));
+   self->is_pci_using_dac = 0;
+   }
+   if (err != 0) {
+   err = -ENOSR;
+   goto err_exit;
+   }
+
+   err = pci_request_regions(self->pdev, AQ_CFG_DRV_NAME "_mmio");
+   if (err < 0)
+   goto err_exit;
+
+   self->is_regions = true;
+
+   pci_set_master(self->pdev);
+
+   for (bar = 0; bar < 4; ++bar) {
+   if (IORESOURCE_MEM & pci_resource_flags(self->pdev, bar)) {
+   resource_size_t reg_sz;
+
+   self->mmio_pa = pci_resource_start(self->pdev, bar);
+   if (self->mmio_pa == 0U) {
+   err = -EIO;
+   goto err_exit;
+   }
+
+ 

[PATCH v5 11/13] net: ethernet: aquantia: Ethtool support

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add the driver interfaces required for support by the ethtool utility.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_ethtool.c | 250 +
 drivers/net/ethernet/aquantia/aq_ethtool.h |  19 +++
 2 files changed, 269 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.h

diff --git a/drivers/net/ethernet/aquantia/aq_ethtool.c 
b/drivers/net/ethernet/aquantia/aq_ethtool.c
new file mode 100644
index 000..f11bdb1
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_ethtool.c
@@ -0,0 +1,250 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_ethtool.c: Definition of ethtool related functions. */
+
+#include "aq_ethtool.h"
+#include "aq_nic.h"
+
+static void aq_ethtool_get_regs(struct net_device *ndev,
+   struct ethtool_regs *regs, void *p)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+   u32 regs_count = aq_nic_get_regs_count(aq_nic);
+
+   memset(p, 0, regs_count * sizeof(u32));
+   aq_nic_get_regs(aq_nic, regs, p);
+}
+
+static int aq_ethtool_get_regs_len(struct net_device *ndev)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+   u32 regs_count = aq_nic_get_regs_count(aq_nic);
+
+   return regs_count * sizeof(u32);
+}
+
+static u32 aq_ethtool_get_link(struct net_device *ndev)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+
+   return aq_nic_get_link_speed(aq_nic) ? 1U : 0U;
+}
+
+static int aq_ethtool_get_settings(struct net_device *ndev,
+  struct ethtool_cmd *cmd)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+
+   cmd->port = PORT_TP;
+   cmd->transceiver = XCVR_EXTERNAL;
+
+   ethtool_cmd_speed_set(cmd, netif_carrier_ok(ndev) ?
+   aq_nic_get_link_speed(aq_nic) : 0U);
+
+   cmd->duplex = DUPLEX_FULL;
+   aq_nic_get_link_settings(aq_nic, cmd);
+   return 0;
+}
+
+static int aq_ethtool_set_settings(struct net_device *ndev,
+  struct ethtool_cmd *cmd)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+
+   return aq_nic_set_link_settings(aq_nic, cmd);
+}
+
+static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = {
+   "InPackets",
+   "InUCast",
+   "InMCast",
+   "InBCast",
+   "InErrors",
+   "OutPackets",
+   "OutUCast",
+   "OutMCast",
+   "OutBCast",
+   "InUCastOctects",
+   "OutUCastOctects",
+   "InMCastOctects",
+   "OutMCastOctects",
+   "InBCastOctects",
+   "OutBCastOctects",
+   "InOctects",
+   "OutOctects",
+   "InPacketsDma",
+   "OutPacketsDma",
+   "InOctetsDma",
+   "OutOctetsDma",
+   "InDroppedDma",
+   "Queue[0] InPackets",
+   "Queue[0] OutPackets",
+   "Queue[0] InJumboPackets",
+   "Queue[0] InLroPackets",
+   "Queue[0] InErrors",
+#if 1 < AQ_CFG_VECS_DEF
+   "Queue[1] InPackets",
+   "Queue[1] OutPackets",
+   "Queue[1] InJumboPackets",
+   "Queue[1] InLroPackets",
+   "Queue[1] InErrors",
+#endif
+#if 2 < AQ_CFG_VECS_DEF
+   "Queue[2] InPackets",
+   "Queue[2] OutPackets",
+   "Queue[2] InJumboPackets",
+   "Queue[2] InLroPackets",
+   "Queue[2] InErrors",
+#endif
+#if 3 < AQ_CFG_VECS_DEF
+   "Queue[3] InPackets",
+   "Queue[3] OutPackets",
+   "Queue[3] InJumboPackets",
+   "Queue[3] InLroPackets",
+   "Queue[3] InErrors",
+#endif
+#if 4 < AQ_CFG_VECS_DEF
+   "Queue[4] InPackets",
+   "Queue[4] OutPackets",
+   "Queue[4] InJumboPackets",
+   "Queue[4] InLroPackets",
+   "Queue[4] InErrors",
+#endif
+#if 5 < AQ_CFG_VECS_DEF
+   "Queue[5] InPackets",
+   "Queue[5] OutPackets",
+   "Queue[5] InJumboPackets",
+   "Queue[5] InLroPackets",
+   "Queue[5] InErrors",
+#endif
+#if 6 < AQ_CFG_VECS_DEF
+   "Queue[6] InPackets",
+   "Queue[6] OutPackets",
+   "Queue[6] InJumboPackets",
+   "Queue[6] InLroPackets",
+   "Queue[6] InErrors",
+#endif
+#if 7 < AQ_CFG_VECS_DEF
+   "Queue[7] InPackets",
+   "Queue[7] OutPackets",
+   "Queue[7] InJumboPackets",
+   "Queue[7] InLroPackets",
+   "Queue[7] 

[PATCH v5 13/13] net: ethernet: aquantia: Integrate AQtion 2.5/5 GB NIC driver

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Modify the drivers/net/ethernet/{Makefile,Kconfig} file to make them a
part of the network drivers build.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/Kconfig  | 1 +
 drivers/net/ethernet/Makefile | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 8cc7467..d467c8b 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -28,6 +28,7 @@ source "drivers/net/ethernet/amazon/Kconfig"
 source "drivers/net/ethernet/amd/Kconfig"
 source "drivers/net/ethernet/apm/Kconfig"
 source "drivers/net/ethernet/apple/Kconfig"
+source "drivers/net/ethernet/aquantia/Kconfig"
 source "drivers/net/ethernet/arc/Kconfig"
 source "drivers/net/ethernet/atheros/Kconfig"
 source "drivers/net/ethernet/aurora/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index a09423d..123ef8e 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_NET_VENDOR_AMAZON) += amazon/
 obj-$(CONFIG_NET_VENDOR_AMD) += amd/
 obj-$(CONFIG_NET_XGENE) += apm/
 obj-$(CONFIG_NET_VENDOR_APPLE) += apple/
+obj-$(CONFIG_NET_VENDOR_AQUANTIA) += aquantia/
 obj-$(CONFIG_NET_VENDOR_ARC) += arc/
 obj-$(CONFIG_NET_VENDOR_ATHEROS) += atheros/
 obj-$(CONFIG_NET_VENDOR_AURORA) += aurora/
-- 
2.7.4



[PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Patches to create the make and configuration files.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/Kconfig  | 24 +++
 drivers/net/ethernet/aquantia/Makefile | 42 ++
 drivers/net/ethernet/aquantia/ver.h| 18 +++
 3 files changed, 84 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/Kconfig
 create mode 100644 drivers/net/ethernet/aquantia/Makefile
 create mode 100644 drivers/net/ethernet/aquantia/ver.h

diff --git a/drivers/net/ethernet/aquantia/Kconfig 
b/drivers/net/ethernet/aquantia/Kconfig
new file mode 100644
index 000..a74a4c0
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/Kconfig
@@ -0,0 +1,24 @@
+#
+# aQuantia device configuration
+#
+
+config NET_VENDOR_AQUANTIA
+   bool "aQuantia devices"
+   default y
+   ---help---
+ Set this to y if you have an Ethernet network card that uses the 
aQuantia
+ chipset.
+
+ This option does not build any drivers; it causes the aQuantia
+ drivers that can be built to appear in the list of Ethernet drivers.
+
+
+if NET_VENDOR_AQUANTIA
+
+config AQTION
+   tristate "aQuantia AQtion Support"
+   depends on PCI
+   ---help---
+ This enables the support for the aQuantia AQtion Ethernet card.
+
+endif # NET_VENDOR_AQUANTIA
diff --git a/drivers/net/ethernet/aquantia/Makefile 
b/drivers/net/ethernet/aquantia/Makefile
new file mode 100644
index 000..e4ae696
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/Makefile
@@ -0,0 +1,42 @@
+
+#
+# aQuantia Ethernet Controller AQtion Linux Driver
+# Copyright(c) 2014-2017 aQuantia Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see .
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Contact Information: 
+# aQuantia Corporation, 105 E. Tasman Dr. San Jose, CA 95134, USA
+#
+
+
+#
+# Makefile for the AQtion(tm) Ethernet driver
+#
+
+obj-$(CONFIG_AQTION) += atlantic.o
+
+atlantic-objs := aq_main.o \
+   aq_nic.o \
+   aq_pci_func.o \
+   aq_vec.o \
+   aq_ring.o \
+   aq_hw_utils.o \
+   aq_ethtool.o \
+   hw_atl/hw_atl_a0.o \
+   hw_atl/hw_atl_b0.o \
+   hw_atl/hw_atl_utils.o \
+   hw_atl/hw_atl_llh.o
diff --git a/drivers/net/ethernet/aquantia/ver.h 
b/drivers/net/ethernet/aquantia/ver.h
new file mode 100644
index 000..636d646
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/ver.h
@@ -0,0 +1,18 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef VER_H
+#define VER_H
+
+#define NIC_MAJOR_DRIVER_VERSION   1
+#define NIC_MINOR_DRIVER_VERSION   5
+#define NIC_BUILD_DRIVER_VERSION   339
+#define NIC_REVISION_DRIVER_VERSION0
+
+#endif /* VER_H */
-- 
2.7.4



[PATCH v5 06/13] net: ethernet: aquantia: Atlantic A0 and B0 specific functions.

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add Atlantic A0 and B0 specific functions.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c   | 907 +++
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.h   |  34 +
 .../ethernet/aquantia/hw_atl/hw_atl_a0_internal.h  | 152 
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.c   | 960 +
 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.h   |  34 +
 .../ethernet/aquantia/hw_atl/hw_atl_b0_internal.h  | 205 +
 6 files changed, 2292 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.h
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0_internal.h
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.c
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0.h
 create mode 100644 drivers/net/ethernet/aquantia/hw_atl/hw_atl_b0_internal.h

diff --git a/drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c 
b/drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c
new file mode 100644
index 000..fafdc9c
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/hw_atl/hw_atl_a0.c
@@ -0,0 +1,907 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File hw_atl_a0.c: Definition of Atlantic hardware specific functions. */
+
+#include "../aq_hw.h"
+#include "../aq_hw_utils.h"
+#include "../aq_ring.h"
+#include "hw_atl_a0.h"
+#include "hw_atl_utils.h"
+#include "hw_atl_llh.h"
+#include "hw_atl_a0_internal.h"
+
+static int hw_atl_a0_get_hw_caps(struct aq_hw_s *self,
+struct aq_hw_caps_s *aq_hw_caps)
+{
+   memcpy(aq_hw_caps, _atl_a0_hw_caps_, sizeof(*aq_hw_caps));
+   return 0;
+}
+
+static struct aq_hw_s *hw_atl_a0_create(struct aq_pci_func_s *aq_pci_func,
+   unsigned int port,
+   struct aq_hw_ops *ops)
+{
+   struct hw_atl_s *self = NULL;
+   int err = 0;
+
+   self = kzalloc(sizeof(*self), GFP_KERNEL);
+   if (!self) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+   self->base.aq_pci_func = aq_pci_func;
+
+   self->base.not_ff_addr = 0x10U;
+
+err_exit:
+   return (struct aq_hw_s *)self;
+}
+
+static void hw_atl_a0_destroy(struct aq_hw_s *self)
+{
+   kfree(self);
+}
+
+static int hw_atl_a0_hw_reset(struct aq_hw_s *self)
+{
+   int err = 0;
+
+   glb_glb_reg_res_dis_set(self, 1U);
+   pci_pci_reg_res_dis_set(self, 0U);
+   rx_rx_reg_res_dis_set(self, 0U);
+   tx_tx_reg_res_dis_set(self, 0U);
+
+   HW_ATL_FLUSH();
+   glb_soft_res_set(self, 1);
+
+   /* check 10 times by 1ms */
+   AQ_HW_WAIT_FOR(glb_soft_res_get(self) == 0, 1000U, 10U);
+   if (err < 0)
+   goto err_exit;
+
+   itr_irq_reg_res_dis_set(self, 0U);
+   itr_res_irq_set(self, 1U);
+
+   /* check 10 times by 1ms */
+   AQ_HW_WAIT_FOR(itr_res_irq_get(self) == 0, 1000U, 10U);
+   if (err < 0)
+   goto err_exit;
+
+   hw_atl_utils_mpi_set(self, MPI_RESET, 0x0U);
+
+   err = aq_hw_err_from_flags(self);
+
+err_exit:
+   return err;
+}
+
+static int hw_atl_a0_hw_qos_set(struct aq_hw_s *self)
+{
+   u32 tc = 0U;
+   u32 buff_size = 0U;
+   unsigned int i_priority = 0U;
+   bool is_rx_flow_control = false;
+
+   /* TPS Descriptor rate init */
+   tps_tx_pkt_shed_desc_rate_curr_time_res_set(self, 0x0U);
+   tps_tx_pkt_shed_desc_rate_lim_set(self, 0xA);
+
+   /* TPS VM init */
+   tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U);
+
+   /* TPS TC credits init */
+   tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
+   tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
+
+   tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, 0U);
+   tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, 0U);
+   tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, 0U);
+   tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, 0U);
+
+   /* Tx buf size */
+   buff_size = HW_ATL_A0_TXBUF_MAX;
+
+   tpb_tx_pkt_buff_size_per_tc_set(self, buff_size, tc);
+   tpb_tx_buff_hi_threshold_per_tc_set(self,
+   (buff_size * (1024 / 32U) * 66U) /
+   100U, tc);
+   tpb_tx_buff_lo_threshold_per_tc_set(self,
+ 

[PATCH v5 05/13] net: ethernet: aquantia: Support for NIC-specific code

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add support for code specific to the Atlantic NIC.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_main.c | 291 
 drivers/net/ethernet/aquantia/aq_main.h |  17 +
 drivers/net/ethernet/aquantia/aq_nic.c  | 910 
 drivers/net/ethernet/aquantia/aq_nic.h  | 108 +++
 drivers/net/ethernet/aquantia/aq_nic_internal.h |  46 ++
 5 files changed, 1372 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_main.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_main.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_nic.c
 create mode 100644 drivers/net/ethernet/aquantia/aq_nic.h
 create mode 100644 drivers/net/ethernet/aquantia/aq_nic_internal.h

diff --git a/drivers/net/ethernet/aquantia/aq_main.c 
b/drivers/net/ethernet/aquantia/aq_main.c
new file mode 100644
index 000..18a6012
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_main.c
@@ -0,0 +1,291 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_main.c: Main file for aQuantia Linux driver. */
+
+#include "aq_main.h"
+#include "aq_nic.h"
+#include "aq_pci_func.h"
+#include "aq_ethtool.h"
+#include "hw_atl/hw_atl_a0.h"
+#include "hw_atl/hw_atl_b0.h"
+
+#include 
+#include 
+
+static const struct pci_device_id aq_pci_tbl[] = {
+   { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_0001), },
+   { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D100), },
+   { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D107), },
+   { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D108), },
+   { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D109), },
+   {}
+};
+
+MODULE_DEVICE_TABLE(pci, aq_pci_tbl);
+
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(AQ_CFG_DRV_VERSION);
+MODULE_AUTHOR(AQ_CFG_DRV_AUTHOR);
+MODULE_DESCRIPTION(AQ_CFG_DRV_DESC);
+
+static struct aq_hw_ops *aq_pci_probe_get_hw_ops_by_id(struct pci_dev *pdev)
+{
+   struct aq_hw_ops *ops = NULL;
+   int err = 0;
+
+   ops = hw_atl_a0_get_ops_by_id(pdev);
+   if (ops) {
+   err = 0;
+   goto err_exit;
+   }
+
+   ops = hw_atl_b0_get_ops_by_id(pdev);
+   if (ops) {
+   err = 0;
+   goto err_exit;
+   }
+
+/* the H/W was not recognized */
+   err = -EFAULT;
+
+err_exit:
+   return ops;
+}
+
+static int aq_ndev_open(struct net_device *ndev)
+{
+   struct aq_nic_s *aq_nic = NULL;
+   int err = 0;
+
+   aq_nic = aq_nic_alloc_hot(ndev);
+   if (!aq_nic) {
+   err = -ENOMEM;
+   goto err_exit;
+   }
+   err = aq_nic_init(aq_nic);
+   if (err < 0)
+   goto err_exit;
+   err = aq_nic_start(aq_nic);
+   if (err < 0)
+   goto err_exit;
+
+err_exit:
+   if (err < 0) {
+   if (aq_nic)
+   aq_nic_deinit(aq_nic);
+   }
+   return err;
+}
+
+static int aq_ndev_close(struct net_device *ndev)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+
+   aq_nic_stop(aq_nic);
+   aq_nic_deinit(aq_nic);
+   aq_nic_free_hot_resources(aq_nic);
+
+   return 0;
+}
+
+static int aq_ndev_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+   int err = 0;
+
+   err = aq_nic_xmit(aq_nic, skb);
+   if (err < 0)
+   goto err_exit;
+
+err_exit:
+   return err;
+}
+
+static int aq_ndev_change_mtu(struct net_device *ndev, int new_mtu)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+   int err = 0;
+
+   if (new_mtu == ndev->mtu) {
+   err = 0;
+   goto err_exit;
+   }
+   if (new_mtu < 68) {
+   err = -EINVAL;
+   goto err_exit;
+   }
+   err = aq_nic_set_mtu(aq_nic, new_mtu + ETH_HLEN);
+   if (err < 0)
+   goto err_exit;
+   ndev->mtu = new_mtu;
+
+   if (netif_running(ndev)) {
+   aq_ndev_close(ndev);
+   aq_ndev_open(ndev);
+   }
+
+err_exit:
+   return err;
+}
+
+static int aq_ndev_set_features(struct net_device *ndev,
+   netdev_features_t features)
+{
+   struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
+   struct aq_nic_cfg_s *aq_cfg = aq_nic_get_cfg(aq_nic);
+   bool is_lro = false;
+
+   

[PATCH v5 12/13] net: ethernet: aquantia: Receive side scaling

2017-01-12 Thread Alexander Loktionov
From: David VomLehn 

Add definitions that support receive side scaling.

Signed-off-by: Alexander Loktionov 
Signed-off-by: Dmitrii Tarakanov 
Signed-off-by: Pavel Belous 
Signed-off-by: Dmitry Bezrukov 
Signed-off-by: David M. VomLehn 
---
 drivers/net/ethernet/aquantia/aq_rss.h | 26 ++
 1 file changed, 26 insertions(+)
 create mode 100644 drivers/net/ethernet/aquantia/aq_rss.h

diff --git a/drivers/net/ethernet/aquantia/aq_rss.h 
b/drivers/net/ethernet/aquantia/aq_rss.h
new file mode 100644
index 000..1db6eb2
--- /dev/null
+++ b/drivers/net/ethernet/aquantia/aq_rss.h
@@ -0,0 +1,26 @@
+/*
+ * aQuantia Corporation Network Driver
+ * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+/* File aq_rss.h: Receive Side Scaling definitions. */
+
+#ifndef AQ_RSS_H
+#define AQ_RSS_H
+
+#include "aq_common.h"
+#include "aq_cfg.h"
+
+struct aq_rss_parameters {
+   u16 base_cpu_number;
+   u16 indirection_table_size;
+   u16 hash_secret_key_size;
+   u32 hash_secret_key[AQ_CFG_RSS_HASHKEY_SIZE / sizeof(u32)];
+   u8 indirection_table[AQ_CFG_RSS_INDIRECTION_TABLE_MAX];
+};
+
+#endif /* AQ_RSS_H */
-- 
2.7.4



Re: [PATCH net-next v2 05/10] drivers: base: Add device_find_class()

2017-01-12 Thread David Miller
From: Florian Fainelli 
Date: Thu, 12 Jan 2017 14:50:39 -0800

> Well, this is really so that we don't need to cast the arguments passed
> to device_find_child(), which takes a void *data as well.

Aha, I didn't catch that, my bad.


Re: [RFC] [PATCH] audit: log 32-bit socketcalls

2017-01-12 Thread Richard Guy Briggs
On 2017-01-12 16:32, Paul Moore wrote:
> On Thu, Jan 12, 2017 at 7:36 AM, Richard Guy Briggs  wrote:
> > 32-bit socketcalls were not being logged by audit on x86_64 systems.
> > Log them.
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/14
> >
> > Signed-off-by: Richard Guy Briggs 
> > ---
> >  net/compat.c |   18 --
> >  1 files changed, 16 insertions(+), 2 deletions(-)
> 
> You should CC netdev on this patch; I'd also mention that you are
> simply duplicating the normal socketcall() auditing in the compat
> version (the only real difference being the argument size handling
> workaround).

D'ho! Completely forgot about netdev.

I thought of mentioning the size handling in the description, but
figured it was somewhat obvious right in the code.  I'll add a comment.

> > diff --git a/net/compat.c b/net/compat.c
> > index 1cd2ec0..86cacab 100644
> > --- a/net/compat.c
> > +++ b/net/compat.c
> > @@ -22,6 +22,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >
> >  #include 
> > @@ -781,14 +782,27 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct 
> > compat_mmsghdr __user *, mmsg,
> >
> >  COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
> >  {
> > +   unsigned int len, i;
> > int ret;
> > -   u32 a[6];
> > +   u32 a[AUDITSC_ARGS];
> > +   unsigned long aa[AUDITSC_ARGS];
> > u32 a0, a1;
> >
> > if (call < SYS_SOCKET || call > SYS_SENDMMSG)
> > return -EINVAL;
> > -   if (copy_from_user(a, args, nas[call]))
> > +   len = nas[call];
> > +   if (len > sizeof(a))
> > +   return -EINVAL;
> > +
> > +   if (copy_from_user(a, args, len))
> > return -EFAULT;
> > +
> > +   for (i=0; i < len/sizeof(a[0]); i++)
> > +   aa[i] = (unsigned long)a[i];
> 
> It will be interesting to see if you get push back on this loop
> outside of audit_socketcall(); folks may want to see it wrapped up
> inside a audit_socketcall_compat() (or similar) function so it isn't
> needlessly called in a number of cases.  However, considering it is
> compat code, and not the common case it may be okay.

I thought about this, and was thinking a check of !audit_dummy_context()
here might be a solution, but audit_socketcall_compat is a much cleaner
idea.  I did also consider that it is compat code that won't have a lot
of performance nerds screaming, but that's no excuse...

> > +   ret = audit_socketcall(len/sizeof(a[0]), aa);
> > +   if (ret)
> > +   return ret;
> > +
> > a0 = a[0];
> > a1 = a[1];
> >
> > --
> > 1.7.1
> 
> -- 
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs 
Kernel Security Engineering, Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635


Re: [net PATCH v3 1/5] virtio_net: use dev_kfree_skb for small buffer XDP receive

2017-01-12 Thread Jason Wang



On 2017年01月13日 10:50, John Fastabend wrote:

In the small buffer case during driver unload we currently use
put_page instead of dev_kfree_skb. Resolve this by adding a check
for virtnet mode when checking XDP queue type. Also name the
function so that the code reads correctly to match the additional
check.

Signed-off-by: John Fastabend 
---
  drivers/net/virtio_net.c |8 ++--
  1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4a10500..d97bb71 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1890,8 +1890,12 @@ static void free_receive_page_frags(struct virtnet_info 
*vi)
put_page(vi->rq[i].alloc_frag.page);
  }
  
-static bool is_xdp_queue(struct virtnet_info *vi, int q)

+static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
  {
+   /* For small receive mode always use kfree_skb variants */
+   if (!vi->mergeable_rx_bufs)
+   return false;
+
if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
return false;
else if (q < vi->curr_queue_pairs)
@@ -1908,7 +1912,7 @@ static void free_unused_bufs(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++) {
struct virtqueue *vq = vi->sq[i].vq;
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-   if (!is_xdp_queue(vi, i))
+   if (!is_xdp_raw_buffer_queue(vi, i))
dev_kfree_skb(buf);
else
put_page(virt_to_head_page(buf));



Acked-by: Jason Wang 


Re: [net PATCH v2 0/5] virtio_net XDP fixes and adjust_header support

2017-01-12 Thread John Fastabend
On 17-01-12 04:34 PM, John Fastabend wrote:
> This has a fix to handle small buffer free logic correctly and then
> also adds adjust head support.
> 
> I pushed adjust head at net (even though its rc3) to avoid having
> to push another exception case into virtio_net to catch if the
> program uses adjust_head and then block it. If there are any strong
> objections to this we can push it at net-next and use a patch from
> Jakub to add the exception handling but then user space has to deal
> with it either via try/fail logic or via kernel version checks. Granted
> we already have some cases that need to be configured to enable XDP
> but I don't see any reason to have yet another one when we can fix it
> now vs delaying a kernel version.
> 
> 
> v2: fix spelling error, convert unsigned -> unsigned int
> 
> ---

Sorry about the v2 here I got a connection reset by peer error from
git and it seems only 2/5 patches made it to the list. To avoid as much
confusion as possible I just sent a v3 and it seems to have completed
correctly.

Thanks,
John



[net PATCH v3 5/5] virtio_net: XDP support for adjust_head

2017-01-12 Thread John Fastabend
Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.

In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.

At the moment this code must do a full reset to ensure old buffers
without headroom on program add or with headroom on program removal
are not used incorrectly in the datapath. Ideally we would only
have to disable/enable the RX queues being updated but there is no
API to do this at the moment in virtio so use the big hammer. In
practice it is likely not that big of a problem as this will only
happen when XDP is enabled/disabled changing programs does not
require the reset. There is some risk that the driver may either
have an allocation failure or for some reason fail to correctly
negotiate with the underlying backend in this case the driver will
be left uninitialized. I have not seen this ever happen on my test
systems and, for what it's worth, this same failure case can occur
from probe and other contexts in virtio framework.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |  155 --
 drivers/virtio/virtio.c  |9 ++-
 include/linux/virtio.h   |3 +
 3 files changed, 144 insertions(+), 23 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6041828..8b897e7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static int napi_weight = NAPI_POLL_WEIGHT;
@@ -159,6 +160,9 @@ struct virtnet_info {
/* Ethtool settings */
u8 duplex;
u32 speed;
+
+   /* Headroom allocated in RX Queue */
+   unsigned int headroom;
 };
 
 struct padded_vnet_hdr {
@@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
}
 
if (vi->mergeable_rx_bufs) {
+   xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
/* Zero header and leave csum up to XDP layers */
hdr = xdp->data;
memset(hdr, 0, vi->hdr_len);
@@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
num_sg = 2;
sg_init_table(sq->sg, 2);
sg_set_buf(sq->sg, hdr, vi->hdr_len);
-   skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+   skb_to_sgvec(skb, sq->sg + 1,
+xdp->data - xdp->data_hard_start,
+xdp->data_end - xdp->data);
}
err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
   data, GFP_ATOMIC);
@@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
struct bpf_prog *xdp_prog;
 
len -= vi->hdr_len;
-   skb_trim(skb, len);
 
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
 
-   xdp.data = skb->data;
+   xdp.data_hard_start = skb->data;
+   xdp.data = skb->data + vi->headroom;
xdp.data_end = xdp.data + len;
act = bpf_prog_run_xdp(xdp_prog, );
switch (act) {
case XDP_PASS:
+   /* Recalculate length in case bpf program changed it */
+   len = xdp.data_end - xdp.data;
+   __skb_pull(skb, xdp.data - xdp.data_hard_start);
break;
case XDP_TX:
virtnet_xdp_xmit(vi, rq, , skb);
@@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
}
rcu_read_unlock();
 
+   skb_trim(skb, len);
return skb;
 
 err_xdp:
@@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type))
goto err_xdp;
 
+   /* Allow consuming headroom but reserve enough space to push
+* the descriptor on if we get an XDP_TX return code.
+*/
data = page_address(xdp_page) + offset;
+   xdp.data_hard_start = data - vi->headroom + desc_room;
xdp.data = data + desc_room;
xdp.data_end = xdp.data + (len - vi->hdr_len);
act = bpf_prog_run_xdp(xdp_prog, );
@@ -748,20 +763,21 @@ static void receive_buf(struct virtnet_info *vi, struct 
receive_queue *rq,
 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 gfp_t gfp)
 {
+   int headroom = GOOD_PACKET_LEN + vi->headroom;
struct sk_buff *skb;
   

[net PATCH v3 3/5] virtio_net: factor out xdp handler for readability

2017-01-12 Thread John Fastabend
At this point the do_xdp_prog is mostly if/else branches handling
the different modes of virtio_net. So remove it and handle running
the program in the per mode handlers.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   76 +-
 1 file changed, 28 insertions(+), 48 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 43cb2e0..ec54644 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -388,49 +388,6 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
virtqueue_kick(sq->vq);
 }
 
-static u32 do_xdp_prog(struct virtnet_info *vi,
-  struct receive_queue *rq,
-  struct bpf_prog *xdp_prog,
-  void *data, int len)
-{
-   int hdr_padded_len;
-   struct xdp_buff xdp;
-   void *buf;
-   unsigned int qp;
-   u32 act;
-
-   if (vi->mergeable_rx_bufs) {
-   hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-   xdp.data = data + hdr_padded_len;
-   xdp.data_end = xdp.data + (len - vi->hdr_len);
-   buf = data;
-   } else { /* small buffers */
-   struct sk_buff *skb = data;
-
-   xdp.data = skb->data;
-   xdp.data_end = xdp.data + len;
-   buf = skb->data;
-   }
-
-   act = bpf_prog_run_xdp(xdp_prog, );
-   switch (act) {
-   case XDP_PASS:
-   return XDP_PASS;
-   case XDP_TX:
-   qp = vi->curr_queue_pairs -
-   vi->xdp_queue_pairs +
-   smp_processor_id();
-   xdp.data = buf;
-   virtnet_xdp_xmit(vi, rq, >sq[qp], , data);
-   return XDP_TX;
-   default:
-   bpf_warn_invalid_xdp_action(act);
-   case XDP_ABORTED:
-   case XDP_DROP:
-   return XDP_DROP;
-   }
-}
-
 static struct sk_buff *receive_small(struct net_device *dev,
 struct virtnet_info *vi,
 struct receive_queue *rq,
@@ -446,19 +403,30 @@ static struct sk_buff *receive_small(struct net_device 
*dev,
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+   struct xdp_buff xdp;
+   unsigned int qp;
u32 act;
 
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
-   act = do_xdp_prog(vi, rq, xdp_prog, skb, len);
+
+   xdp.data = skb->data;
+   xdp.data_end = xdp.data + len;
+   act = bpf_prog_run_xdp(xdp_prog, );
switch (act) {
case XDP_PASS:
break;
case XDP_TX:
+   qp = vi->curr_queue_pairs -
+   vi->xdp_queue_pairs +
+   smp_processor_id();
+   virtnet_xdp_xmit(vi, rq, >sq[qp], , skb);
rcu_read_unlock();
goto xdp_xmit;
-   case XDP_DROP:
default:
+   bpf_warn_invalid_xdp_action(act);
+   case XDP_ABORTED:
+   case XDP_DROP:
goto err_xdp;
}
}
@@ -575,7 +543,11 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
+   int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
struct page *xdp_page;
+   struct xdp_buff xdp;
+   unsigned int qp;
+   void *data;
u32 act;
 
/* This happens when rx buffer size is underestimated */
@@ -598,8 +570,10 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
if (unlikely(hdr->hdr.gso_type))
goto err_xdp;
 
-   act = do_xdp_prog(vi, rq, xdp_prog,
- page_address(xdp_page) + offset, len);
+   data = page_address(xdp_page) + offset;
+   xdp.data = data + desc_room;
+   xdp.data_end = xdp.data + (len - vi->hdr_len);
+   act = bpf_prog_run_xdp(xdp_prog, );
switch (act) {
case XDP_PASS:
/* We can only create skb based on xdp_page. */
@@ -613,13 +587,19 @@ static struct sk_buff *receive_mergeable(struct 
net_device *dev,
}
break;
case XDP_TX:
+   qp = vi->curr_queue_pairs -
+   vi->xdp_queue_pairs +
+   smp_processor_id();
+   virtnet_xdp_xmit(vi, rq, >sq[qp], 

[net PATCH v3 4/5] virtio_net: remove duplicate queue pair binding in XDP

2017-01-12 Thread John Fastabend
Factor out qp assignment.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ec54644..6041828 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -332,15 +332,19 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
*vi,
 
 static void virtnet_xdp_xmit(struct virtnet_info *vi,
 struct receive_queue *rq,
-struct send_queue *sq,
 struct xdp_buff *xdp,
 void *data)
 {
struct virtio_net_hdr_mrg_rxbuf *hdr;
unsigned int num_sg, len;
+   struct send_queue *sq;
+   unsigned int qp;
void *xdp_sent;
int err;
 
+   qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+   sq = >sq[qp];
+
/* Free up any pending old buffers before queueing new ones. */
while ((xdp_sent = virtqueue_get_buf(sq->vq, )) != NULL) {
if (vi->mergeable_rx_bufs) {
@@ -404,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
struct xdp_buff xdp;
-   unsigned int qp;
u32 act;
 
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
@@ -417,10 +420,7 @@ static struct sk_buff *receive_small(struct net_device 
*dev,
case XDP_PASS:
break;
case XDP_TX:
-   qp = vi->curr_queue_pairs -
-   vi->xdp_queue_pairs +
-   smp_processor_id();
-   virtnet_xdp_xmit(vi, rq, >sq[qp], , skb);
+   virtnet_xdp_xmit(vi, rq, , skb);
rcu_read_unlock();
goto xdp_xmit;
default:
@@ -546,7 +546,6 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
struct page *xdp_page;
struct xdp_buff xdp;
-   unsigned int qp;
void *data;
u32 act;
 
@@ -587,10 +586,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
}
break;
case XDP_TX:
-   qp = vi->curr_queue_pairs -
-   vi->xdp_queue_pairs +
-   smp_processor_id();
-   virtnet_xdp_xmit(vi, rq, >sq[qp], , data);
+   virtnet_xdp_xmit(vi, rq, , data);
ewma_pkt_len_add(>mrg_avg_pkt_len, len);
if (unlikely(xdp_page != page))
goto err_xdp;



[net PATCH v3 2/5] net: virtio: wrap rtnl_lock in test for calling with lock already held

2017-01-12 Thread John Fastabend
For the XDP use case and to allow ethtool reset tests it is useful
to be able to use reset routines from contexts where the rtnl lock
is already held.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d97bb71..43cb2e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1864,12 +1864,13 @@ static void virtnet_free_queues(struct virtnet_info *vi)
kfree(vi->sq);
 }
 
-static void free_receive_bufs(struct virtnet_info *vi)
+static void free_receive_bufs(struct virtnet_info *vi, bool need_lock)
 {
struct bpf_prog *old_prog;
int i;
 
-   rtnl_lock();
+   if (need_lock)
+   rtnl_lock();
for (i = 0; i < vi->max_queue_pairs; i++) {
while (vi->rq[i].pages)
__free_pages(get_a_page(>rq[i], GFP_KERNEL), 0);
@@ -1879,7 +1880,8 @@ static void free_receive_bufs(struct virtnet_info *vi)
if (old_prog)
bpf_prog_put(old_prog);
}
-   rtnl_unlock();
+   if (need_lock)
+   rtnl_unlock();
 }
 
 static void free_receive_page_frags(struct virtnet_info *vi)
@@ -2351,14 +2353,14 @@ static int virtnet_probe(struct virtio_device *vdev)
return err;
 }
 
-static void remove_vq_common(struct virtnet_info *vi)
+static void remove_vq_common(struct virtnet_info *vi, bool lock)
 {
vi->vdev->config->reset(vi->vdev);
 
/* Free unused buffers in both send and recv, if any. */
free_unused_bufs(vi);
 
-   free_receive_bufs(vi);
+   free_receive_bufs(vi, lock);
 
free_receive_page_frags(vi);
 
@@ -2376,7 +2378,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 
unregister_netdev(vi->dev);
 
-   remove_vq_common(vi);
+   remove_vq_common(vi, true);
 
free_percpu(vi->stats);
free_netdev(vi->dev);
@@ -2401,7 +2403,7 @@ static int virtnet_freeze(struct virtio_device *vdev)
napi_disable(>rq[i].napi);
}
 
-   remove_vq_common(vi);
+   remove_vq_common(vi, true);
 
return 0;
 }



[net PATCH v3 0/5] virtio_net XDP fixes and adjust_header support

2017-01-12 Thread John Fastabend
This has a fix to handle small buffer free logic correctly and then
also adds adjust head support.

I pushed adjust head at net (even though it's rc3) to avoid having
to push another exception case into virtio_net to catch if the
program uses adjust_head and then block it. If there are any strong
objections to this we can push it at net-next and use a patch from
Jakub to add the exception handling but then user space has to deal
with it either via try/fail logic or via kernel version checks. Granted
we already have some cases that need to be configured to enable XDP
but I don't see any reason to have yet another one when we can fix it
now vs delaying a kernel version.


v2: fix spelling error, convert unsigned -> unsigned int
v3: v2 git crashed during send so retrying sorry for the noise

---

John Fastabend (5):
  virtio_net: use dev_kfree_skb for small buffer XDP receive
  net: virtio: wrap rtnl_lock in test for calling with lock already held
  virtio_net: factor out xdp handler for readability
  virtio_net: remove duplicate queue pair binding in XDP
  virtio_net: XDP support for adjust_head


 drivers/net/virtio_net.c |  251 --
 drivers/virtio/virtio.c  |9 +-
 include/linux/virtio.h   |3 +
 3 files changed, 183 insertions(+), 80 deletions(-)

--
Signature


[net PATCH v3 1/5] virtio_net: use dev_kfree_skb for small buffer XDP receive

2017-01-12 Thread John Fastabend
In the small buffer case during driver unload we currently use
put_page instead of dev_kfree_skb. Resolve this by adding a check
for virtnet mode when checking XDP queue type. Also name the
function so that the code reads correctly to match the additional
check.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4a10500..d97bb71 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1890,8 +1890,12 @@ static void free_receive_page_frags(struct virtnet_info 
*vi)
put_page(vi->rq[i].alloc_frag.page);
 }
 
-static bool is_xdp_queue(struct virtnet_info *vi, int q)
+static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
 {
+   /* For small receive mode always use kfree_skb variants */
+   if (!vi->mergeable_rx_bufs)
+   return false;
+
if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
return false;
else if (q < vi->curr_queue_pairs)
@@ -1908,7 +1912,7 @@ static void free_unused_bufs(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++) {
struct virtqueue *vq = vi->sq[i].vq;
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-   if (!is_xdp_queue(vi, i))
+   if (!is_xdp_raw_buffer_queue(vi, i))
dev_kfree_skb(buf);
else
put_page(virt_to_head_page(buf));



Re: [PATCH net-next v2 00/10] net: dsa: Support for pdata in dsa2

2017-01-12 Thread Vivien Didelot
Hi Florian,

Florian Fainelli  writes:

> Hi all,
>
> This is not exactly new, and was sent before, although back then, I did not
> have a user of the pre-declared MDIO board information, but now we do. Note
> that I have additional changes queued up to have b53 register platform data 
> for
> MIPS bcm47xx and bcm63xx.
>
> Yes I know that we should have the Orion platforms eventually be converted to
> Device Tree, but until that happens, I don't want any remaining users of the
> old "dsa" platform device (hence the previous DTS submissions for ARM/mvebu)
> and, there will be platforms out there that most likely won't never see DT
> coming their way (BCM47xx is almost 100% sure, BCM63xx maybe not in a distant
> future).
>
> We would probably want the whole series to be merged via David Miller's tree
> to simplify things.
>
> Greg, can you Ack/Nack patch 5 since it touched the core LDD?
>
> Thanks!

I've tested this patchset on my mv88e6xxx (DTS) boards to make sure
nothing was broken, since it touches the driver. Looks good!

Tested-by: Vivien Didelot 

Thanks,

Vivien


Re: [PATCH iproute2] Add support for rt_protos.d

2017-01-12 Thread David Ahern
On 1/12/17 6:27 PM, Stephen Hemminger wrote:
>> diff --git a/etc/iproute2/rt_protos.d/README 
>> b/etc/iproute2/rt_protos.d/README
>> new file mode 100644
>> index ..723509ce56b6
>> --- /dev/null
>> +++ b/etc/iproute2/rt_protos.d/README
>> @@ -0,0 +1,3 @@
>> +Each file in this directory is an rt_protos configuration file. iproute2
>> +commands scan this directory processing all files that end in '.conf'.
>> +
> 
> Applied, but required manual fixup.
> 
> 
> .git/rebase-apply/patch:15: new blank line at EOF.
> +
> warning: 1 line adds whitespace errors.
> 

yes, i noticed that when I applied the patch to our local repo. It's a copy of 
the rt_tables.d patch and that README has the extra newline too.


Re: [PATCH] i40e: Invoke softirqs after napi_reschedule

2017-01-12 Thread Benjamin Poirier
On 2017/01/12 17:15, Eric Dumazet wrote:
> On Thu, 2017-01-12 at 17:04 -0800, Benjamin Poirier wrote:
> > The following message is logged from time to time when using i40e:
> > NOHZ: local_softirq_pending 08
> > 
> > i40e may schedule napi from a workqueue. Afterwards, softirqs are not run
> > in a deterministic time frame. The problem is the same as what was
> > described in commit ec13ee80145c ("virtio_net: invoke softirqs after
> > __napi_schedule") and this patch applies the same fix to i40e.
> 
> Yes, I believe mlx4 has a similar problem in mlx4_en_recover_from_oom()

Indeed, I was going to send a patch for mlx4 after this one is accepted.


Re: [PATCH iproute2/net-next 0/2] net/sched: cls_flower: Support matching ARP

2017-01-12 Thread Stephen Hemminger
On Thu, 12 Jan 2017 09:11:57 +0100
Simon Horman  wrote:

> Add support for support matching on ARP operation, and hardware and
> protocol addresses for Ethernet hardware and IPv4 protocol addresses.
> 
> Changes since RFC:
> * Drop RFC designation; kernel patches are present in net-next
> 
> Simon Horman (2):
>   tc: flower: update headers for TCA_FLOWER_KEY_ARP*
>   tc: flower: Support matching ARP
> 
>  include/linux/pkt_cls.h |  11 +++
>  man/man8/tc-flower.8|  41 +-
>  tc/f_flower.c   | 208 
> 
>  3 files changed, 243 insertions(+), 17 deletions(-)
> 

Applied to net-next


Re: [PATCH iproute2 v4 3/4] ifstat: Add 64 bits based stats to extended statistics

2017-01-12 Thread Stephen Hemminger
On Thu, 12 Jan 2017 15:49:50 +0200
Nogah Frankel  wrote:

> The default stats for ifstat are 32 bits based.
> The kernel supports 64 bits based stats. (They are returned in struct
> rtnl_link_stats64 which is an exact copy of struct rtnl_link_stats, in
> which the "normal" stats are returned, but with fields of u64 instead of
> u32). This patch adds them as an extended stats.
> 
> It is read with filter type IFLA_STATS_LINK_64 and no sub type.
> 
> It is under the name 64bits
> (or any shorten of it as "64")
> 
> For example:
> ifstat -x 64bit
> 
> Signed-off-by: Nogah Frankel 
> Reviewed-by: Jiri Pirko 

Other commands (like ip link) always use the 64 bit statistics if available
from the device. I see no reason that ifstat needs to be different.




Re: [PATCH iproute2] bridge: fdb: add state filter support

2017-01-12 Thread Stephen Hemminger
On Thu, 12 Jan 2017 17:47:39 +0100
Nikolay Aleksandrov  wrote:

> This patch adds a new argument to the bridge fdb show command that allows
> to filter by entry state.
> Also update the man page to include all available show arguments.
> 
> Signed-off-by: Nikolay Aleksandrov 
> ---

Applied. Extra thanks for remembering to update man page.


Re: [PATCH iproute2] rttable: Fix invalid range checking when table id is converted to u32

2017-01-12 Thread Stephen Hemminger
On Tue, 10 Jan 2017 15:33:55 -0800
David Ahern  wrote:

> Frank reported that table ids for very large numbers are not properly
> detected:
> $ ip li add foobar type vrf table 98765432100123456789
> 
> command succeeds and resulting table id is actually:
> 
> 21: foobar:  mtu 65536 qdisc noop state DOWN mode DEFAULT group 
> default qlen 1000
> link/ether da:ea:d4:77:38:2a brd ff:ff:ff:ff:ff:ff promiscuity 0
> vrf table 4294967295 addrgenmode eui64 numtxqueues 1 numrxqueues 1 
> gso_max_size 65536 gso_max_segs 65535
> 
> Make the temp variable 'i' unsigned long and let the typecast to u32
> happen on assignment to id.
> 
> Reported-by: Frank Kellermann 
> Signed-off-by: David Ahern 

Applied thanks.


Re: [PATCH iproute2] ip6tunnel: Align ipv6 tunnel key display with ipv4

2017-01-12 Thread Stephen Hemminger
On Tue, 10 Jan 2017 10:45:54 +
David Forster  wrote:

> Show ipv6 tunnel keys on presence of GRE_KEY flag for tunnel types
> other than GRE. Aligns ipv6 behaviour with ipv4.
> 
> Signed-off-by: dfors...@brocade.com

Applied thanks.


Re: [PATCH v2 7/7] uapi: export all headers under uapi directories

2017-01-12 Thread Jeff Epler
On Thu, Jan 12, 2017 at 05:32:09PM +0100, Nicolas Dichtel wrote:
> What I was trying to say is that I export those directories like other are.
> Removing those files is not related to that series.

Perhaps the correct solution is to only copy files matching "*.h" to
reduce the risk of copying files incidentally created by kbuild but
which shouldn't be installed as uapi headers.

jeff


Re: [PATCH iproute2] Add support for rt_protos.d

2017-01-12 Thread Stephen Hemminger
On Mon,  9 Jan 2017 15:43:09 -0800
David Ahern  wrote:

> Add support for reading proto id/name mappings from rt_protos.d
> directory. Allows users to have custom protocol values converted
> to human friendly names.
> 
> Each file under rt_protos.d has the 'id name' format used by
> rt_protos. Only .conf files are read and parsed.
> 
> Signed-off-by: David Ahern 
> ---
>  etc/iproute2/rt_protos.d/README |  3 +++
>  lib/rt_names.c  | 27 +++
>  2 files changed, 30 insertions(+)
>  create mode 100644 etc/iproute2/rt_protos.d/README
> 
> diff --git a/etc/iproute2/rt_protos.d/README b/etc/iproute2/rt_protos.d/README
> new file mode 100644
> index ..723509ce56b6
> --- /dev/null
> +++ b/etc/iproute2/rt_protos.d/README
> @@ -0,0 +1,3 @@
> +Each file in this directory is an rt_protos configuration file. iproute2
> +commands scan this directory processing all files that end in '.conf'.
> +

Applied, but required manual fixup.


.git/rebase-apply/patch:15: new blank line at EOF.
+
warning: 1 line adds whitespace errors.


Re: [PATCH] i40e: Invoke softirqs after napi_reschedule

2017-01-12 Thread Eric Dumazet
On Thu, 2017-01-12 at 17:04 -0800, Benjamin Poirier wrote:
> The following message is logged from time to time when using i40e:
> NOHZ: local_softirq_pending 08
> 
> i40e may schedule napi from a workqueue. Afterwards, softirqs are not run
> in a deterministic time frame. The problem is the same as what was
> described in commit ec13ee80145c ("virtio_net: invoke softirqs after
> __napi_schedule") and this patch applies the same fix to i40e.

Yes, I believe mlx4 has a similar problem in mlx4_en_recover_from_oom()




Re: [PATCH 5/6] treewide: use kv[mz]alloc* rather than opencoded variants

2017-01-12 Thread Dilger, Andreas

> On Jan 12, 2017, at 08:37, Michal Hocko  wrote:
> 
> From: Michal Hocko 
> 
> There are many code paths opencoding kvmalloc. Let's use the helper
> instead. The main difference to kvmalloc is that those users are usually
> not considering all the aspects of the memory allocator. E.g. allocation
> requests < 64kB are basically never failing and invoke OOM killer to
> satisfy the allocation. This sounds too disruptive for something that
> has a reasonable fallback - the vmalloc. On the other hand those
> requests might fallback to vmalloc even when the memory allocator would
> succeed after several more reclaim/compaction attempts previously. There
> is no guarantee something like that happens though.
> 
> This patch converts many of those places to kv[mz]alloc* helpers because
> they are more conservative.
> 
> Signed-off-by: Michal Hocko 

Lustre part can be
Acked-by: Andreas Dilger 

[snip]

> diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c 
> b/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c
> index a6a76a681ea9..8f638267e704 100644
> --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c
> +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c
> @@ -45,15 +45,6 @@ EXPORT_SYMBOL(libcfs_kvzalloc);
> void *libcfs_kvzalloc_cpt(struct cfs_cpt_table *cptab, int cpt, size_t size,
> gfp_t flags)
> {
> - void *ret;
> -
> - ret = kzalloc_node(size, flags | __GFP_NOWARN,
> -cfs_cpt_spread_node(cptab, cpt));
> - if (!ret) {
> - WARN_ON(!(flags & (__GFP_FS | __GFP_HIGH)));
> - ret = vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt));
> - }
> -
> - return ret;
> + return kvzalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt));
> }
> EXPORT_SYMBOL(libcfs_kvzalloc_cpt);





[PATCH] i40e: Invoke softirqs after napi_reschedule

2017-01-12 Thread Benjamin Poirier
The following message is logged from time to time when using i40e:
NOHZ: local_softirq_pending 08

i40e may schedule napi from a workqueue. Afterwards, softirqs are not run
in a deterministic time frame. The problem is the same as what was
described in commit ec13ee80145c ("virtio_net: invoke softirqs after
__napi_schedule") and this patch applies the same fix to i40e.

Signed-off-by: Benjamin Poirier 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad4cf63..d65488c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -4621,8 +4621,10 @@ static void i40e_detect_recover_hung_queue(int q_idx, 
struct i40e_vsi *vsi)
 */
if ((!tx_pending_hw) && i40e_get_tx_pending(tx_ring, true) &&
(!(val & I40E_PFINT_DYN_CTLN_INTENA_MASK))) {
+   local_bh_disable();
if (napi_reschedule(_ring->q_vector->napi))
tx_ring->tx_stats.tx_lost_interrupt++;
+   local_bh_enable();
}
 }
 
-- 
2.10.2



[net PATCH v2 2/5] net: virtio: wrap rtnl_lock in test for calling with lock already held

2017-01-12 Thread John Fastabend
For the XDP use case and to allow ethtool reset tests it is useful
to be able to use reset routines from contexts where the rtnl lock
is already held.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |   16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d97bb71..43cb2e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1864,12 +1864,13 @@ static void virtnet_free_queues(struct virtnet_info *vi)
kfree(vi->sq);
 }
 
-static void free_receive_bufs(struct virtnet_info *vi)
+static void free_receive_bufs(struct virtnet_info *vi, bool need_lock)
 {
struct bpf_prog *old_prog;
int i;
 
-   rtnl_lock();
+   if (need_lock)
+   rtnl_lock();
for (i = 0; i < vi->max_queue_pairs; i++) {
while (vi->rq[i].pages)
__free_pages(get_a_page(>rq[i], GFP_KERNEL), 0);
@@ -1879,7 +1880,8 @@ static void free_receive_bufs(struct virtnet_info *vi)
if (old_prog)
bpf_prog_put(old_prog);
}
-   rtnl_unlock();
+   if (need_lock)
+   rtnl_unlock();
 }
 
 static void free_receive_page_frags(struct virtnet_info *vi)
@@ -2351,14 +2353,14 @@ static int virtnet_probe(struct virtio_device *vdev)
return err;
 }
 
-static void remove_vq_common(struct virtnet_info *vi)
+static void remove_vq_common(struct virtnet_info *vi, bool lock)
 {
vi->vdev->config->reset(vi->vdev);
 
/* Free unused buffers in both send and recv, if any. */
free_unused_bufs(vi);
 
-   free_receive_bufs(vi);
+   free_receive_bufs(vi, lock);
 
free_receive_page_frags(vi);
 
@@ -2376,7 +2378,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 
unregister_netdev(vi->dev);
 
-   remove_vq_common(vi);
+   remove_vq_common(vi, true);
 
free_percpu(vi->stats);
free_netdev(vi->dev);
@@ -2401,7 +2403,7 @@ static int virtnet_freeze(struct virtio_device *vdev)
napi_disable(>rq[i].napi);
}
 
-   remove_vq_common(vi);
+   remove_vq_common(vi, true);
 
return 0;
 }



[net PATCH v2 1/5] virtio_net: use dev_kfree_skb for small buffer XDP receive

2017-01-12 Thread John Fastabend
In the small buffer case during driver unload we currently use
put_page instead of dev_kfree_skb. Resolve this by adding a check
for virtnet mode when checking XDP queue type. Also name the
function so that the code reads correctly to match the additional
check.

Signed-off-by: John Fastabend 
---
 drivers/net/virtio_net.c |8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4a10500..d97bb71 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1890,8 +1890,12 @@ static void free_receive_page_frags(struct virtnet_info 
*vi)
put_page(vi->rq[i].alloc_frag.page);
 }
 
-static bool is_xdp_queue(struct virtnet_info *vi, int q)
+static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
 {
+   /* For small receive mode always use kfree_skb variants */
+   if (!vi->mergeable_rx_bufs)
+   return false;
+
if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
return false;
else if (q < vi->curr_queue_pairs)
@@ -1908,7 +1912,7 @@ static void free_unused_bufs(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++) {
struct virtqueue *vq = vi->sq[i].vq;
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-   if (!is_xdp_queue(vi, i))
+   if (!is_xdp_raw_buffer_queue(vi, i))
dev_kfree_skb(buf);
else
put_page(virt_to_head_page(buf));



[net PATCH v2 0/5] virtio_net XDP fixes and adjust_header support

2017-01-12 Thread John Fastabend
This has a fix to handle small buffer free logic correctly and then
also adds adjust head support.

I pushed adjust head at net (even though it's rc3) to avoid having
to push another exception case into virtio_net to catch if the
program uses adjust_head and then block it. If there are any strong
objections to this we can push it at net-next and use a patch from
Jakub to add the exception handling but then user space has to deal
with it either via try/fail logic or via kernel version checks. Granted
we already have some cases that need to be configured to enable XDP
but I don't see any reason to have yet another one when we can fix it
now vs delaying a kernel version.


v2: fix spelling error, convert unsigned -> unsigned int

---

John Fastabend (5):
  virtio_net: use dev_kfree_skb for small buffer XDP receive
  net: virtio: wrap rtnl_lock in test for calling with lock already held
  virtio_net: factor out xdp handler for readability
  virtio_net: remove duplicate queue pair binding in XDP
  virtio_net: XDP support for adjust_head


 drivers/net/virtio_net.c |  251 --
 drivers/virtio/virtio.c  |9 +-
 include/linux/virtio.h   |3 +
 3 files changed, 183 insertions(+), 80 deletions(-)

--
Signature


[PATCH net] openvswitch: maintain correct checksum state in conntrack actions

2017-01-12 Thread Lance Richardson
When executing conntrack actions on skbuffs with checksum mode
CHECKSUM_COMPLETE, the checksum must be updated to account for
header pushes and pulls. Otherwise we get "hw csum failure"
logs similar to this (ICMP packet received on geneve tunnel
via ixgbe NIC):

[  405.740065] genev_sys_6081: hw csum failure
[  405.740106] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G  I 
4.10.0-rc3+ #1
[  405.740108] Call Trace:
[  405.740110]  
[  405.740113]  dump_stack+0x63/0x87
[  405.740116]  netdev_rx_csum_fault+0x3a/0x40
[  405.740118]  __skb_checksum_complete+0xcf/0xe0
[  405.740120]  nf_ip_checksum+0xc8/0xf0
[  405.740124]  icmp_error+0x1de/0x351 [nf_conntrack_ipv4]
[  405.740132]  nf_conntrack_in+0xe1/0x550 [nf_conntrack]
[  405.740137]  ? find_bucket.isra.2+0x62/0x70 [openvswitch]
[  405.740143]  __ovs_ct_lookup+0x95/0x980 [openvswitch]
[  405.740145]  ? netif_rx_internal+0x44/0x110
[  405.740149]  ovs_ct_execute+0x147/0x4b0 [openvswitch]
[  405.740153]  do_execute_actions+0x22e/0xa70 [openvswitch]
[  405.740157]  ovs_execute_actions+0x40/0x120 [openvswitch]
[  405.740161]  ovs_dp_process_packet+0x84/0x120 [openvswitch]
[  405.740166]  ovs_vport_receive+0x73/0xd0 [openvswitch]
[  405.740168]  ? udp_rcv+0x1a/0x20
[  405.740170]  ? ip_local_deliver_finish+0x93/0x1e0
[  405.740172]  ? ip_local_deliver+0x6f/0xe0
[  405.740174]  ? ip_rcv_finish+0x3a0/0x3a0
[  405.740176]  ? ip_rcv_finish+0xdb/0x3a0
[  405.740177]  ? ip_rcv+0x2a7/0x400
[  405.740180]  ? __netif_receive_skb_core+0x970/0xa00
[  405.740185]  netdev_frame_hook+0xd3/0x160 [openvswitch]
[  405.740187]  __netif_receive_skb_core+0x1dc/0xa00
[  405.740194]  ? ixgbe_clean_rx_irq+0x46d/0xa20 [ixgbe]
[  405.740197]  __netif_receive_skb+0x18/0x60
[  405.740199]  netif_receive_skb_internal+0x40/0xb0
[  405.740201]  napi_gro_receive+0xcd/0x120
[  405.740204]  gro_cell_poll+0x57/0x80 [geneve]
[  405.740206]  net_rx_action+0x260/0x3c0
[  405.740209]  __do_softirq+0xc9/0x28c
[  405.740211]  irq_exit+0xd9/0xf0
[  405.740213]  do_IRQ+0x51/0xd0
[  405.740215]  common_interrupt+0x93/0x93

Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
Signed-off-by: Lance Richardson 
---
 net/openvswitch/conntrack.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6b78bab..54253ea 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -514,7 +514,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct 
nf_conn *ct,
int hooknum, nh_off, err = NF_ACCEPT;
 
nh_off = skb_network_offset(skb);
-   skb_pull(skb, nh_off);
+   skb_pull_rcsum(skb, nh_off);
 
/* See HOOK2MANIP(). */
if (maniptype == NF_NAT_MANIP_SRC)
@@ -579,6 +579,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct 
nf_conn *ct,
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
 push:
skb_push(skb, nh_off);
+   skb_postpush_rcsum(skb, skb->data, nh_off);
 
return err;
 }
@@ -886,7 +887,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
-   skb_pull(skb, nh_ofs);
+   skb_pull_rcsum(skb, nh_ofs);
 
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
@@ -900,6 +901,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_lookup(net, key, info, skb);
 
skb_push(skb, nh_ofs);
+   skb_postpush_rcsum(skb, skb->data, nh_ofs);
if (err)
kfree_skb(skb);
return err;
-- 
1.8.3.1



[PATCH net-next] liquidio: use fallback for selecting txq

2017-01-12 Thread Felix Manlunas
From: Satanand Burla 

Remove assignment to ndo_select_queue so that fallback is used for
selecting txq.  Also remove the now-useless function that used to be
assigned to ndo_select_queue.

Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Derek Chickles 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 20 
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 21 -
 2 files changed, 41 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index cc825d5..2b89ec2 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2223,25 +2223,6 @@ static void if_cfg_callback(struct octeon_device *oct,
wake_up_interruptible(&ctx->wc);
 }
 
-/**
- * \brief Select queue based on hash
- * @param dev Net device
- * @param skb sk_buff structure
- * @returns selected queue number
- */
-static u16 select_q(struct net_device *dev, struct sk_buff *skb,
-   void *accel_priv __attribute__((unused)),
-   select_queue_fallback_t fallback __attribute__((unused)))
-{
-   u32 qindex = 0;
-   struct lio *lio;
-
-   lio = GET_LIO(dev);
-   qindex = skb_tx_hash(dev, skb);
-
-   return (u16)(qindex % (lio->linfo.num_txpciq));
-}
-
 /** Routine to push packets arriving on Octeon interface upto network layer.
  * @param oct_id   - octeon device id.
  * @param skbuff   - skbuff struct to be passed to network layer.
@@ -3755,7 +3736,6 @@ static const struct net_device_ops lionetdevops = {
.ndo_set_vf_vlan= liquidio_set_vf_vlan,
.ndo_get_vf_config  = liquidio_get_vf_config,
.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
-   .ndo_select_queue   = select_q
 };
 
 /** \brief Entry point for the liquidio module
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index ad2e72d..19d88fb 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1455,26 +1455,6 @@ static void if_cfg_callback(struct octeon_device *oct,
wake_up_interruptible(&ctx->wc);
 }
 
-/**
- * \brief Select queue based on hash
- * @param dev Net device
- * @param skb sk_buff structure
- * @returns selected queue number
- */
-static u16 select_q(struct net_device *dev, struct sk_buff *skb,
-   void *accel_priv __attribute__((unused)),
-   select_queue_fallback_t fallback __attribute__((unused)))
-{
-   struct lio *lio;
-   u32 qindex;
-
-   lio = GET_LIO(dev);
-
-   qindex = skb_tx_hash(dev, skb);
-
-   return (u16)(qindex % (lio->linfo.num_txpciq));
-}
-
 /** Routine to push packets arriving on Octeon interface upto network layer.
  * @param oct_id   - octeon device id.
  * @param skbuff   - skbuff struct to be passed to network layer.
@@ -2717,7 +2697,6 @@ static const struct net_device_ops lionetdevops = {
.ndo_set_features   = liquidio_set_features,
.ndo_udp_tunnel_add = liquidio_add_vxlan_port,
.ndo_udp_tunnel_del = liquidio_del_vxlan_port,
-   .ndo_select_queue   = select_q,
 };
 
 static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf)


Re: [Patch net] atm: remove an unnecessary loop

2017-01-12 Thread Cong Wang
On Thu, Jan 12, 2017 at 4:07 PM, Francois Romieu  wrote:
> Cong Wang  :
> [...]
>> diff --git a/net/atm/common.c b/net/atm/common.c
>> index a3ca922..7ec3bbc 100644
>> --- a/net/atm/common.c
>> +++ b/net/atm/common.c
>> @@ -72,10 +72,11 @@ static struct sk_buff *alloc_tx(struct atm_vcc *vcc, 
>> unsigned int size)
>>sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
>>   return NULL;
>>   }
>> - while (!(skb = alloc_skb(size, GFP_KERNEL)))
>> - schedule();
>> - pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
>> - atomic_add(skb->truesize, &sk->sk_wmem_alloc);
>> + skb = alloc_skb(size, GFP_KERNEL);
>> + if (skb) {
>> + pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
>> + atomic_add(skb->truesize, &sk->sk_wmem_alloc);
>> + }
>>   return skb;
>>  }
>
> Were alloc_skb moved one level up in the call stack, there would be
> no need to use the new wait api in the subsequent page, thus easing
> pre 3.19 longterm kernel maintenance (at least those on korg page).

alloc_skb(GFP_KERNEL) itself is sleeping, so the new wait api is still
needed.


Re: [Patch net] atm: remove an unnecessary loop

2017-01-12 Thread Francois Romieu
Cong Wang  :
[...]
> diff --git a/net/atm/common.c b/net/atm/common.c
> index a3ca922..7ec3bbc 100644
> --- a/net/atm/common.c
> +++ b/net/atm/common.c
> @@ -72,10 +72,11 @@ static struct sk_buff *alloc_tx(struct atm_vcc *vcc, 
> unsigned int size)
>sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
>   return NULL;
>   }
> - while (!(skb = alloc_skb(size, GFP_KERNEL)))
> - schedule();
> - pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
> - atomic_add(skb->truesize, &sk->sk_wmem_alloc);
> + skb = alloc_skb(size, GFP_KERNEL);
> + if (skb) {
> + pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
> + atomic_add(skb->truesize, &sk->sk_wmem_alloc);
> + }
>   return skb;
>  }

Were alloc_skb moved one level up in the call stack, there would be
no need to use the new wait api in the subsequent page, thus easing
pre 3.19 longterm kernel maintenance (at least those on korg page).

But it tastes a tad bit too masochistic.

-- 
Ueimor


Re: [net PATCH 5/5] virtio_net: XDP support for adjust_head

2017-01-12 Thread John Fastabend
On 17-01-12 02:22 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 12, 2017 at 01:45:19PM -0800, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when a XDP
>> program is loaded.
>>
>> In order to ensure that we do not have to unwind queue headroom push
>> queue setup below bpf_prog_add. It reads better to do a prog ref
>> unwind vs another queue setup call.
>>
>> At the moment this code must do a full reset to ensure old buffers
>> without headroom on program add or with headroom on program removal
>> are not used incorrectly in the datapath. Ideally we would only
>> have to disable/enable the RX queues being updated but there is no
>> API to do this at the moment in virtio so use the big hammer. In
>> practice it is likely not that big of a problem as this will only
>> happen when XDP is enabled/disabled changing programs does not
>> require the reset. There is some risk that the driver may either
>> have an allocation failure or for some reason fail to correctly
>> negotiate with the underlying backend in this case the driver will
>> be left uninitialized. I have not seen this ever happen on my test
>> systems and for what its worth this same failure case can occur
>> from probe and other contexts in virtio framework.
> 
> Could you explain about this a bit more?
> Thanks!
> 

Sure. There are two existing paths and this patch adds a third one
where the driver basically goes through this reset path. First one
is on probe the other one is on the freeze/restore path.

The virtnet_freeze() path eventually free's the memory for rq/sq
(receive queues and send queues).

virtnet_freeze()
...
remove_vq_common()
...
virtnet_dev_vqs()
vdev->config->del_vqs()
virtnet_free_queues <- this does a kfree


On virtnet_restore() path we then have to reallocate and reneg with
backend.

virtnet_restore()
...
init_vqs()
...
virtnet_alloc_queues() <- alloc sq/rq
virtnet_find_vqs()
(allocates callbacks/names/vqs)

So the above allocs could fail and leave the device in a FAILED
state. This can happen today on probe or freeze/restore paths and
after this patch possibly on XDP load. Although as noted I have not
seen it happen in any of the above cases.

Second failure mode could happen if virtio_finalize_features() fails.
This seems unlikely because in order to probe successfully we had to
finalize the features successfully earlier. But it could I guess happen
based on return codes. Again never seen this actually happen. This is
called in probe case, freeze/restore case, and XDP now as well.

Does that help? Also I need to send a v2 to fix a spelling mistake and
to convert a 'unsigned' to 'unsigned int' per checkpatch warning. Always
better to run checkpatch before submitting vs after.

Thanks,
John




Re: [PATCH] can: Fix kernel panic at security_sock_rcv_skb

2017-01-12 Thread Eric Dumazet
On Thu, 2017-01-12 at 14:40 -0800, william.c.robe...@intel.com wrote:
> From: Zhang Yanmin 
> 
> The patch is for fix the below kernel panic:
> BUG: unable to handle kernel NULL pointer dereference at (null)
> IP: [] selinux_socket_sock_rcv_skb+0x65/0x2a0

Same patch was sent earlier, and we gave a feedback on it.

Adding synchronize_rcu() calls is a step backward.

https://patchwork.ozlabs.org/patch/714446/






Re: [PATCH] xen-netfront: Fix Rx stall during network stress and OOM

2017-01-12 Thread Vineeth Remanan Pillai



On 01/12/2017 12:17 PM, David Miller wrote:

From: Vineeth Remanan Pillai 
Date: Wed, 11 Jan 2017 23:17:17 +


@@ -1054,7 +1059,11 @@ static int xennet_poll(struct napi_struct *napi, int 
budget)
napi_complete(napi);
  
		RING_FINAL_CHECK_FOR_RESPONSES(&queue->rx, more_to_do);

-   if (more_to_do)
+
+   /* If there is more work to do or could not allocate
+* rx buffers, re-enable polling.
+*/
+   if (more_to_do || err != 0)
napi_schedule(napi);

Just polling endlessly in a loop retrying the SKB allocation over and over
again until it succeeds is not very nice behavior.

You already have that refill timer, so please use that to retry instead
of wasting cpu cycles looping in NAPI poll.

Thanks Dave for the inputs.
On further look, I think I can fix it much simpler by correcting the 
test condition

for minimum slots for pushing requests. Existing test is like this:


/* Not enough requests? Try again later. */
   if (req_prod - queue->rx.rsp_cons < NET_RX_SLOTS_MIN) {
mod_timer(&queue->rx_refill_timer, jiffies + (HZ/10));
return;
}


Actually the above check counts more than the newly created request slots
as it counts from rsp_cons. The actual count should be the difference 
between
new req_prod and old req_prod (in the queue). If skbs cannot be created, 
this
count remains small and hence we would schedule the timer. So the fix 
could be:


/* Not enough requests? Try again later. */
-   if (req_prod - queue->rx.rsp_cons < NET_RX_SLOTS_MIN) {
+   if (req_prod - queue->rx.sring->req_prod < NET_RX_SLOTS_MIN) {


I have done some initial testing to verify the fix. Will send out v2 
patch after couple

more round of testing.

Thanks,
Vineeth


[PATCH net-next] net: dsa: mv88e6xxx: add EEPROM support to 6390

2017-01-12 Thread Vivien Didelot
The Marvell 6352 chip has a 8-bit address/16-bit data EEPROM access.
The Marvell 6390 chip has a 16-bit address/8-bit data EEPROM access.

This patch implements the 8-bit data EEPROM access in the mv88e6xxx
driver and adds its support to chips of the 6390 family.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 14 ++
 drivers/net/dsa/mv88e6xxx/global2.c   | 93 ++-
 drivers/net/dsa/mv88e6xxx/global2.h   | 21 
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  1 +
 4 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index eea8e0176e33..987b2dbbd35a 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3453,6 +3453,8 @@ static const struct mv88e6xxx_ops mv88e6185_ops = {
 
 static const struct mv88e6xxx_ops mv88e6190_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3478,6 +3480,8 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
 
 static const struct mv88e6xxx_ops mv88e6190x_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3503,6 +3507,8 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
 
 static const struct mv88e6xxx_ops mv88e6191_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3556,6 +3562,8 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
 
 static const struct mv88e6xxx_ops mv88e6290_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3714,6 +3722,8 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
 
 static const struct mv88e6xxx_ops mv88e6390_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3741,6 +3751,8 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
 
 static const struct mv88e6xxx_ops mv88e6390x_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
@@ -3768,6 +3780,8 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
 
 static const struct mv88e6xxx_ops mv88e6391_ops = {
/* MV88E6XXX_FAMILY_6390 */
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c 
b/drivers/net/dsa/mv88e6xxx/global2.c
index 3e77071949ab..ead2e265c9ef 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -218,7 +218,8 @@ static int mv88e6xxx_g2_clear_pot(struct mv88e6xxx_chip 
*chip)
 }
 
 /* Offset 0x14: EEPROM Command
- * Offset 0x15: EEPROM Data
+ * Offset 0x15: EEPROM Data (for 16-bit data access)
+ * Offset 0x15: EEPROM Addr (for 8-bit data access)
  */
 
 static int mv88e6xxx_g2_eeprom_wait(struct mv88e6xxx_chip *chip)
@@ -239,6 +240,50 @@ static int mv88e6xxx_g2_eeprom_cmd(struct mv88e6xxx_chip 
*chip, u16 cmd)
return mv88e6xxx_g2_eeprom_wait(chip);
 }
 
+static int mv88e6xxx_g2_eeprom_read8(struct mv88e6xxx_chip *chip,
+u16 addr, u8 *data)
+{
+   u16 cmd = GLOBAL2_EEPROM_CMD_OP_READ;
+   int err;
+
+   err = mv88e6xxx_g2_eeprom_wait(chip);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_g2_write(chip, GLOBAL2_EEPROM_ADDR, addr);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_g2_eeprom_cmd(chip, cmd);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_g2_read(chip, GLOBAL2_EEPROM_CMD, &cmd);
+   if (err)
+   return err;
+

[PATCH net-next] ARM: dts: vf610-zii-dev: add EEPROM entry to Rev B

2017-01-12 Thread Vivien Didelot
The ZII Dev Rev B board has EEPROMs hanging off the 88E6352 Ethernet switch
chips. Add an "eeprom-length" property to allow access from ethtool.

Signed-off-by: Vivien Didelot 
---
 arch/arm/boot/dts/vf610-zii-dev-rev-b.dts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts 
b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
index 958b4c42d320..b60d3d03f58c 100644
--- a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
+++ b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
@@ -98,6 +98,7 @@
interrupts = <27 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
#interrupt-cells = <2>;
+   eeprom-length = <512>;
 
ports {
#address-cells = <1>;
@@ -181,6 +182,7 @@
interrupts = <26 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
#interrupt-cells = <2>;
+   eeprom-length = <512>;
 
ports {
#address-cells = <1>;
-- 
2.11.0



Re: [PATCH] ARM: dts: am57xx-beagle-x15: implement errata "Ethernet RGMII2 Limited to 10/100 Mbps"

2017-01-12 Thread Tony Lindgren
* Grygorii Strashko  [170112 09:15]:
> According to errata i880 description the speed of Ethernet port 1 on AM572x
> SoCs rev 1.1 should be limited to 10/100Mbps, because RGMII2 Switching
> Characteristics are not compatible with 1000 Mbps operation [1].
> The issue is fixed with Rev 2.0 silicon.
> 
> Hence, rework Beagle-X15 and Begale-X15-revb1 to use phy-handle instead of
> phy_id and apply corresponding limitation to the Ethernet Phy 1.
> 
> [1] http://www.ti.com/lit/er/sprz429j/sprz429j.pdf

Applying into omap-for-v4.11/dt thanks.

Tony


Re: [PATCH net-next v2 05/10] drivers: base: Add device_find_class()

2017-01-12 Thread Florian Fainelli
On 01/12/2017 01:21 PM, David Miller wrote:
> From: Florian Fainelli 
> Date: Wed, 11 Jan 2017 19:41:16 -0800
> 
>> Add a helper function to lookup a device reference given a class name.
>> This is a preliminary patch to remove adhoc code from net/dsa/dsa.c and
>> make it more generic.
>>
>> Signed-off-by: Florian Fainelli 
>> ---
>>  drivers/base/core.c| 19 +++
>>  include/linux/device.h |  1 +
>>  2 files changed, 20 insertions(+)
>>
>> diff --git a/drivers/base/core.c b/drivers/base/core.c
>> index 020ea7f05520..3dd6047c10d8 100644
>> --- a/drivers/base/core.c
>> +++ b/drivers/base/core.c
>> @@ -2065,6 +2065,25 @@ struct device *device_find_child(struct device 
>> *parent, void *data,
>>  }
>>  EXPORT_SYMBOL_GPL(device_find_child);
>>  
>> +static int dev_is_class(struct device *dev, void *class)
> 
> I know you are just moving code, but this class argumnet is a string
> and thus should be "char *" or even "const char *".

Well, this is really so that we don't need to cast the arguments passed
to device_find_child(), which takes a void *data as well. If we made
that a const char *class, we'd get warnings that look like these:

drivers/base/core.c: In function 'device_find_class':
drivers/base/core.c:2083:2: warning: passing argument 2 of
'device_find_child' discards 'const' qualifier from pointer target type
[enabled by default]
  return device_find_child(parent, class, dev_is_class);
  ^
drivers/base/core.c:2050:16: note: expected 'void *' but argument is of
type 'const char *'
 struct device *device_find_child(struct device *parent, void *data,
^
drivers/base/core.c:2083:2: warning: passing argument 3 of
'device_find_child' from incompatible pointer type [enabled by default]
  return device_find_child(parent, class, dev_is_class);
  ^
drivers/base/core.c:2050:16: note: expected 'int (*)(struct device *,
void *)' but argument is of type 'int (*)(struct device *, const char *)'
 struct device *device_find_child(struct device *parent, void *data,
^

-- 
Florian


Re: [PATCH v2] tcp: fix tcp_fastopen unaligned access complaints on sparc

2017-01-12 Thread Eric Dumazet
On Thu, 2017-01-12 at 14:24 -0800, Shannon Nelson wrote:
> Fix up a data alignment issue on sparc by swapping the order
> of the cookie byte array field with the length field in
> struct tcp_fastopen_cookie, and making it a proper union
> to clean up the typecasting.
> 
> This addresses log complaints like these:
> log_unaligned: 113 callbacks suppressed
> Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360
> Kernel unaligned access at TPC[9764ac] tcp_try_fastopen+0x2ec/0x360
> Kernel unaligned access at TPC[9764c8] tcp_try_fastopen+0x308/0x360
> Kernel unaligned access at TPC[9764e4] tcp_try_fastopen+0x324/0x360
> Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360
> 
> Cc: Eric Dumazet 
> Signed-off-by: Shannon Nelson 
> ---
> v2: Use Eric's suggestion for a union in the struct

Acked-by: Eric Dumazet 

Thanks for fixing this !





RE: Marvell Phy (1510) issue since v4.7 kernel

2017-01-12 Thread Kwok, WingMan
Hi Andrew,

> -Original Message-
> From: Andrew Lunn [mailto:and...@lunn.ch]
> Sent: Thursday, January 12, 2017 4:50 PM
> To: Kwok, WingMan
> Cc: rmk+ker...@arm.linux.org.uk; Karicheri, Muralidharan;
> netdev@vger.kernel.org
> Subject: Re: Marvell Phy (1510) issue since v4.7 kernel
> 
> > But our problem is caused by the read_status function:
> >
> > if ((phydev->supported & SUPPORTED_FIBRE)) {
> > err = phy_write(phydev, MII_MARVELL_PHY_PAGE,
> MII_M_FIBER);
> > if (err < 0)
> > goto error;
> >
> > err = marvell_read_status_page(phydev, MII_M_FIBER);
> > if (err < 0)
> > goto error;
> >
> > /* If the fiber link is up, it is the selected and used
> link.
> > * In this case, we need to stay in the fiber page.
> > * Please to be careful about that, avoid to restore Copper
> page
> > * in other functions which could break the behaviour
> > * for some fiber phy like 88E1512.
> > * */
> > if (phydev->link)
> > return 0;
> >
> > which keeps the fiber page if phydev->link is true (for some
> > reason this is the case even though we are not using fiber)
> 
> How are you using the PHY. What phy-mode do you have set?  Do you
> happen to be using it as an RGMII to SERDES/SGMII bridge? This is what
> Russell King is doing, i think.
> 

our 88e1514 is connected to the host via sgmii.

> Have you tried the patch Russell submitted recently.
> 
> Author: Russell King 
> Date:   Tue Jan 10 23:13:45 2017 +
> 
> net: phy: marvell: fix Marvell 88E1512 used in SGMII mode
> 
> When an Marvell 88E1512 PHY is connected to a nic in SGMII mode,
> the
> fiber page is used for the SGMII host-side connection.  The PHY
> driver
> notices that SUPPORTED_FIBRE is set, so it tries reading the fiber
> page
> for the link status, and ends up reading the MAC-side status
> instead of
> the outgoing (copper) link.  This leads to incorrect results
> reported
> via ethtool.
> 
> If the PHY is connected via SGMII to the host, ignore the fiber
> page.
> However, continue to allow the existing power management code to
> suspend and resume the fiber page.
> 

Thanks for pointer. It does fix the problem.

> > However, this causes a problem in kernel reboot because neither
> > the suspend/resume is called to restore the copper page and
> > u-boot marvell phy driver does not support 1510 fiber, which
> > will then result in writing to the wrong phy regs and causes
> > a sgmii auto-nego time out.
> 
> This is still a u-boot bug. It should not assume the PHY is in a sane
> state. It should reset it and configure it as needed. So far, i don't
> think you have reported any issues with Linux usage of the PHY. There
> clearly are bugs, but your real problem is u-boot.
> 

Yes. Agree.

> > In addition to fixing the ! in suspend/resume, my suggestion
> > would be to change also the read_status function to
> > always restore the copper page after doing the fiber stuffs:
> 
> Nope. This is done deliberately, as the comment suggests:
> 
> > /* If the fiber link is up, it is the selected and used
> link.
> > * In this case, we need to stay in the fiber page.
> > * Please to be careful about that, avoid to restore Copper
> page
> > * in other functions which could break the behaviour
> > * for some fiber phy like 88E1512.
> > * */
> > if (phydev->link)
> > return 0;
> 
> The point is, the phylib will continue polling the PHY registers,
> reading them. If the FIBRE is up, we want to read the FIBRE values,
> not the copper.
> 

Thanks for the explanations.

> > Another issue is that, as of now, FIBER is enabled regardless
> > of the specific 88e151x. But I believe there is 88e151x chip(s)
> > that does not support fiber. Should fiber be enabled only for
> > those that do support fiber?
> 
> Yes, we should look at register 30, page 18 any set SUPPORTED_FIBRE
> based on that.
> 

Thanks for the suggestions.

>   Andrew

Just want to know if there is already a patch in the net tree
fixing the incorrect ! in the suspend/resume functions also?

WingMan


Re: [net-next PATCH 1/3] Revert "icmp: avoid allocating large struct on stack"

2017-01-12 Thread Cong Wang
On Tue, Jan 10, 2017 at 10:54 AM, David Miller  wrote:
> From: Cong Wang 
> Date: Tue, 10 Jan 2017 10:44:59 -0800
>> The only countries you hold netdev are Canada, Japan and Spain
>> (to my knowledge). If you check:
>>
>> https://en.wikipedia.org/wiki/Visa_requirements_for_Chinese_citizens#Visa_requirements
>>
>> It is very easy to find out if it is an excuse or a fact.
>
> The conference explciitly offers VISA help for anyone who needs it.
> Many people come to the conference on a VISA we helped them obtain.

I never complain about visa sponsorship or money, this is the last
problem for me to consider.

>
> It is not hard to do.  You have put in exactly zero effort in trying
> to solve this problem.  If you had simply contacted the conference
> organizers asking for help, you would have gotten all the help you
> needed.

If obtaining a visa were as easy as obtaining a sponsorship, I would
go as many conferences as I know. But the fact is there are already
too many hassles to obtain even _one_ visa, not to mention I need
to obtain _two_ visas to go to Canada (or Japan, Spain) and return
to US. I'd be very happy to attend it if it were held in US, but this never
happens.


[PATCH] can: Fix kernel panic at security_sock_rcv_skb

2017-01-12 Thread william . c . roberts
From: Zhang Yanmin 

The patch is for fix the below kernel panic:
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [] selinux_socket_sock_rcv_skb+0x65/0x2a0

Call Trace:
 
 [] security_sock_rcv_skb+0x4c/0x60
 [] sk_filter+0x41/0x210
 [] sock_queue_rcv_skb+0x53/0x3a0
 [] raw_rcv+0x2a3/0x3c0
 [] can_rcv_filter+0x12b/0x370
 [] can_receive+0xd9/0x120
 [] can_rcv+0xab/0x100
 [] __netif_receive_skb_core+0xd8c/0x11f0
 [] __netif_receive_skb+0x24/0xb0
 [] process_backlog+0x127/0x280
 [] net_rx_action+0x33b/0x4f0
 [] __do_softirq+0x184/0x440
 [] do_softirq_own_stack+0x1c/0x30
 
 [] do_softirq.part.18+0x3b/0x40
 [] do_softirq+0x1d/0x20
 [] netif_rx_ni+0xe5/0x110
 [] slcan_receive_buf+0x507/0x520
 [] flush_to_ldisc+0x21c/0x230
 [] process_one_work+0x24f/0x670
 [] worker_thread+0x9d/0x6f0
 [] ? rescuer_thread+0x480/0x480
 [] kthread+0x12c/0x150
 [] ret_from_fork+0x3f/0x70

The sk dereferenced in panic has been released. After the rcu_call in
can_rx_unregister, receiver was protected by RCU but inner data was
not, then later sk will be freed while other CPU is still using it.
We need wait here to make sure sk referenced via receiver was safe.

=> security_sk_free
=> sk_destruct
=> __sk_free
=> sk_free
=> raw_release
=> sock_release
=> sock_close
=> __fput
=> fput
=> task_work_run
=> exit_to_usermode_loop
=> syscall_return_slowpath
=> int_ret_from_sys_call

Tracked-On: https://jira01.devtools.intel.com/browse/OAM-40528
Signed-off-by: Zhang Yanmin 
Signed-off-by: He, Bo 
Signed-off-by: Liu Shuo A 
Signed-off-by: William Roberts 
---
 net/can/af_can.c | 14 --
 net/can/af_can.h |  1 -
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1108079..fcbe971 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -517,10 +517,8 @@ EXPORT_SYMBOL(can_rx_register);
 /*
  * can_rx_delete_receiver - rcu callback for single receiver entry removal
  */
-static void can_rx_delete_receiver(struct rcu_head *rp)
+static void can_rx_delete_receiver(struct receiver *r)
 {
-   struct receiver *r = container_of(rp, struct receiver, rcu);
-
kmem_cache_free(rcv_cache, r);
 }
 
@@ -595,9 +593,13 @@ void can_rx_unregister(struct net_device *dev, canid_t 
can_id, canid_t mask,
  out:
spin_unlock(&can_rcvlists_lock);
 
-   /* schedule the receiver item for deletion */
-   if (r)
-   call_rcu(&r->rcu, can_rx_delete_receiver);
+   /* synchronize_rcu to wait until a grace period has elapsed, to make
+* sure all receiver's sk dereferenced by others.
+*/
+   if (r) {
+   synchronize_rcu();
+   can_rx_delete_receiver(r);
+   }
 }
 EXPORT_SYMBOL(can_rx_unregister);
 
diff --git a/net/can/af_can.h b/net/can/af_can.h
index fca0fe9..a0cbf83 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -50,7 +50,6 @@
 
 struct receiver {
struct hlist_node list;
-   struct rcu_head rcu;
canid_t can_id;
canid_t mask;
unsigned long matches;
-- 
2.7.4



  1   2   3   4   >