Re: [PATCH v3] net: macb: Handle HRESP error

2018-01-26 Thread Harini Katakam
Hi David,

On Sat, Jan 27, 2018 at 12:09 PM,   wrote:
> From: Harini Katakam 
>
> Handle HRESP error by doing a SW reset of RX and TX and
> re-initializing the descriptors, RX and TX queue pointers.
>
> Signed-off-by: Harini Katakam 
> Signed-off-by: Michal Simek 
> ---
> v3 and v2 changes:
> Fixed patch formatting errors
> Rebased on latest net-next and reinitialized
> multiple rx queues in error handling.
>
>  drivers/net/ethernet/cadence/macb.h  |  3 ++
>  drivers/net/ethernet/cadence/macb_main.c | 59 
> +---
>  2 files changed, 58 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb.h 
> b/drivers/net/ethernet/cadence/macb.h
> index c50c5ec..8665982 100644
> --- a/drivers/net/ethernet/cadence/macb.h
> +++ b/drivers/net/ethernet/cadence/macb.h
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
>  #define MACB_EXT_DESC
> @@ -1200,6 +1201,8 @@ struct macb {
> struct ethtool_rx_fs_list rx_fs_list;
> spinlock_t rx_fs_lock;
> unsigned int max_tuples;
> +
> +   struct tasklet_struct   hresp_err_tasklet;
>  };
>
>  #ifdef CONFIG_MACB_USE_HWSTAMP
> diff --git a/drivers/net/ethernet/cadence/macb_main.c 
> b/drivers/net/ethernet/cadence/macb_main.c
> index 234667e..e84afcf 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int 
> budget)
> return work_done;
>  }
>
> +static void macb_hresp_error_task(unsigned long data)
> +{
> +   struct macb *bp = (struct macb *)data;
> +   struct net_device *dev = bp->dev;
> +   struct macb_queue *queue = bp->queues;
> +   unsigned int q;
> +   u32 ctrl;
> +
> +   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
> +   queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
> +MACB_TX_INT_FLAGS |
> +MACB_BIT(HRESP));
> +   }
> +   ctrl = macb_readl(bp, NCR);
> +   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
> +   macb_writel(bp, NCR, ctrl);
> +
> +   netif_tx_stop_all_queues(dev);
> +   netif_carrier_off(dev);
> +
> +   bp->macbgem_ops.mog_init_rings(bp);
> +
> +   /* Initialize TX and RX buffers */
> +   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
> +   queue_writel(queue, RBQP, lower_32_bits(queue->rx_ring_dma));
> +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> +   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
> +   queue_writel(queue, RBQPH,
> +upper_32_bits(queue->rx_ring_dma));
> +#endif
> +   queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
> +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> +   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
> +   queue_writel(queue, TBQPH,
> +upper_32_bits(queue->tx_ring_dma));
> +#endif
> +
> +   /* Enable interrupts */
> +   queue_writel(queue, IER,
> +MACB_RX_INT_FLAGS |
> +MACB_TX_INT_FLAGS |
> +MACB_BIT(HRESP));
> +   }
> +
> +   ctrl |= MACB_BIT(RE) | MACB_BIT(TE);
> +   macb_writel(bp, NCR, ctrl);
> +
> +   netif_carrier_on(dev);
> +   netif_tx_start_all_queues(dev);
> +}
> +
>  static irqreturn_t macb_interrupt(int irq, void *dev_id)
>  {
> struct macb_queue *queue = dev_id;
> @@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void 
> *dev_id)
> }
>
> if (status & MACB_BIT(HRESP)) {
> -   /* TODO: Reset the hardware, and maybe move the
> -* netdev_err to a lower-priority context as well
> -* (work queue?)
> -*/
> +   tasklet_schedule(&bp->hresp_err_tasklet);
> netdev_err(dev, "DMA bus error: HRESP not OK\n");
>
> if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> @@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev)
> goto err_out_unregister_mdio;
> }
>
> +   tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
> +(unsigned long)bp);
> +
> phy_attached_info(phydev);
>
> netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
> --
> 2.7.4
>

Apologies for the spam.
This is the v3. Please do let me know in case you still see any corruption.

Regards,
Harini


[PATCH v3] net: macb: Handle HRESP error

2018-01-26 Thread harinikatakamlinux
From: Harini Katakam 

Handle HRESP error by doing a SW reset of RX and TX and
re-initializing the descriptors, RX and TX queue pointers.

Signed-off-by: Harini Katakam 
Signed-off-by: Michal Simek 
---
v3 and v2 changes:
Fixed patch formatting errors
Rebased on latest net-next and reinitialized
multiple rx queues in error handling.

 drivers/net/ethernet/cadence/macb.h  |  3 ++
 drivers/net/ethernet/cadence/macb_main.c | 59 +---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index c50c5ec..8665982 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
 #define MACB_EXT_DESC
@@ -1200,6 +1201,8 @@ struct macb {
struct ethtool_rx_fs_list rx_fs_list;
spinlock_t rx_fs_lock;
unsigned int max_tuples;
+
+   struct tasklet_struct   hresp_err_tasklet;
 };
 
 #ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 234667e..e84afcf 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int 
budget)
return work_done;
 }
 
+static void macb_hresp_error_task(unsigned long data)
+{
+   struct macb *bp = (struct macb *)data;
+   struct net_device *dev = bp->dev;
+   struct macb_queue *queue = bp->queues;
+   unsigned int q;
+   u32 ctrl;
+
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+   ctrl = macb_readl(bp, NCR);
+   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
+   macb_writel(bp, NCR, ctrl);
+
+   netif_tx_stop_all_queues(dev);
+   netif_carrier_off(dev);
+
+   bp->macbgem_ops.mog_init_rings(bp);
+
+   /* Initialize TX and RX buffers */
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, RBQP, lower_32_bits(queue->rx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, RBQPH,
+upper_32_bits(queue->rx_ring_dma));
+#endif
+   queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, TBQPH,
+upper_32_bits(queue->tx_ring_dma));
+#endif
+
+   /* Enable interrupts */
+   queue_writel(queue, IER,
+MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+
+   ctrl |= MACB_BIT(RE) | MACB_BIT(TE);
+   macb_writel(bp, NCR, ctrl);
+
+   netif_carrier_on(dev);
+   netif_tx_start_all_queues(dev);
+}
+
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
struct macb_queue *queue = dev_id;
@@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
}
 
if (status & MACB_BIT(HRESP)) {
-   /* TODO: Reset the hardware, and maybe move the
-* netdev_err to a lower-priority context as well
-* (work queue?)
-*/
+   tasklet_schedule(&bp->hresp_err_tasklet);
netdev_err(dev, "DMA bus error: HRESP not OK\n");
 
if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
@@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev)
goto err_out_unregister_mdio;
}
 
+   tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
+(unsigned long)bp);
+
phy_attached_info(phydev);
 
netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
-- 
2.7.4



[PATCH v2] net: macb: Handle HRESP error

2018-01-26 Thread harinikatakamlinux
From: Harini Katakam 

Handle HRESP error by doing a SW reset of RX and TX and
re-initializing the descriptors, RX and TX queue pointers.

Signed-off-by: Harini Katakam 
Signed-off-by: Michal Simek 
---
v2:
Rebased on top of latest net-next and reinitialized
all rx queues.

 drivers/net/ethernet/cadence/macb.h  |  3 ++
 drivers/net/ethernet/cadence/macb_main.c | 59 +---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index c50c5ec..8665982 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
 #define MACB_EXT_DESC
@@ -1200,6 +1201,8 @@ struct macb {
struct ethtool_rx_fs_list rx_fs_list;
spinlock_t rx_fs_lock;
unsigned int max_tuples;
+
+   struct tasklet_struct   hresp_err_tasklet;
 };
 
 #ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 234667e..e84afcf 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int 
budget)
return work_done;
 }
 
+static void macb_hresp_error_task(unsigned long data)
+{
+   struct macb *bp = (struct macb *)data;
+   struct net_device *dev = bp->dev;
+   struct macb_queue *queue = bp->queues;
+   unsigned int q;
+   u32 ctrl;
+
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+   ctrl = macb_readl(bp, NCR);
+   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
+   macb_writel(bp, NCR, ctrl);
+
+   netif_tx_stop_all_queues(dev);
+   netif_carrier_off(dev);
+
+   bp->macbgem_ops.mog_init_rings(bp);
+
+   /* Initialize TX and RX buffers */
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, RBQP, lower_32_bits(queue->rx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, RBQPH,
+upper_32_bits(queue->rx_ring_dma));
+#endif
+   queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, TBQPH,
+upper_32_bits(queue->tx_ring_dma));
+#endif
+
+   /* Enable interrupts */
+   queue_writel(queue, IER,
+MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+
+   ctrl |= MACB_BIT(RE) | MACB_BIT(TE);
+   macb_writel(bp, NCR, ctrl);
+
+   netif_carrier_on(dev);
+   netif_tx_start_all_queues(dev);
+}
+
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
struct macb_queue *queue = dev_id;
@@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
}
 
if (status & MACB_BIT(HRESP)) {
-   /* TODO: Reset the hardware, and maybe move the
-* netdev_err to a lower-priority context as well
-* (work queue?)
-*/
+   tasklet_schedule(&bp->hresp_err_tasklet);
netdev_err(dev, "DMA bus error: HRESP not OK\n");
 
if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
@@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev)
goto err_out_unregister_mdio;
}
 
+   tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
+(unsigned long)bp);
+
phy_attached_info(phydev);
 
netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
-- 
2.7.4



[PATCH v2] net: macb: Handle HRESP error

2018-01-26 Thread harinikatakamlinux
From: Harini Katakam 

Handle HRESP error by doing a SW reset of RX and TX and
re-initializing the descriptors, RX and TX queue pointers.

Signed-off-by: Harini Katakam 
Signed-off-by: Michal Simek 
---
v2:
Rebased on top of latest net-next and reinitialized
all rx queues.

 drivers/net/ethernet/cadence/macb.h  |  3 ++
 drivers/net/ethernet/cadence/macb_main.c | 59 +---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index c50c5ec..8665982 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 

 #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
 #define MACB_EXT_DESC
@@ -1200,6 +1201,8 @@ struct macb {
struct ethtool_rx_fs_list rx_fs_list;
spinlock_t rx_fs_lock;
unsigned int max_tuples;
+
+   struct tasklet_struct   hresp_err_tasklet;
 };

 #ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 234667e..e84afcf 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int 
budget)
return work_done;
 }

+static void macb_hresp_error_task(unsigned long data)
+{
+   struct macb *bp = (struct macb *)data;
+   struct net_device *dev = bp->dev;
+   struct macb_queue *queue = bp->queues;
+   unsigned int q;
+   u32 ctrl;
+
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+   ctrl = macb_readl(bp, NCR);
+   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
+   macb_writel(bp, NCR, ctrl);
+
+   netif_tx_stop_all_queues(dev);
+   netif_carrier_off(dev);
+
+   bp->macbgem_ops.mog_init_rings(bp);
+
+   /* Initialize TX and RX buffers */
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, RBQP, lower_32_bits(queue->rx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, RBQPH,
+upper_32_bits(queue->rx_ring_dma));
+#endif
+   queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, TBQPH,
+upper_32_bits(queue->tx_ring_dma));
+#endif
+
+   /* Enable interrupts */
+   queue_writel(queue, IER,
+MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+
+   ctrl |= MACB_BIT(RE) | MACB_BIT(TE);
+   macb_writel(bp, NCR, ctrl);
+
+   netif_carrier_on(dev);
+   netif_tx_start_all_queues(dev);
+}
+
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
struct macb_queue *queue = dev_id;
@@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
}

if (status & MACB_BIT(HRESP)) {
-   /* TODO: Reset the hardware, and maybe move the
-* netdev_err to a lower-priority context as well
-* (work queue?)
-*/
+   tasklet_schedule(&bp->hresp_err_tasklet);
netdev_err(dev, "DMA bus error: HRESP not OK\n");

if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
@@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev)
goto err_out_unregister_mdio;
}

+   tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
+(unsigned long)bp);
+
phy_attached_info(phydev);

netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
--
2.7.4

This email and any attachments are intended for the sole use of the named 
recipient(s) and contain(s) confidential information that may be proprietary, 
privileged or copyrighted under applicable law. If you are not the intended 
recipient, do not read, copy, or forward this email message or any attachments. 
Delete this email message and any attachments immediately.


[PATCH v2] net: macb: Handle HRESP error

2018-01-26 Thread harinikatakamlinux.com
From: Harini Katakam 

Handle HRESP error by doing a SW reset of RX and TX and
re-initializing the descriptors, RX and TX queue pointers.

Signed-off-by: Harini Katakam 
Signed-off-by: Michal Simek 
---
v2:
Rebased on top of latest net-next and reinitialized
all rx queues.

 drivers/net/ethernet/cadence/macb.h  |  3 ++
 drivers/net/ethernet/cadence/macb_main.c | 59 +---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index c50c5ec..8665982 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 

 #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) || defined(CONFIG_MACB_USE_HWSTAMP)
 #define MACB_EXT_DESC
@@ -1200,6 +1201,8 @@ struct macb {
struct ethtool_rx_fs_list rx_fs_list;
spinlock_t rx_fs_lock;
unsigned int max_tuples;
+
+   struct tasklet_struct   hresp_err_tasklet;
 };

 #ifdef CONFIG_MACB_USE_HWSTAMP
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 234667e..e84afcf 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1258,6 +1258,57 @@ static int macb_poll(struct napi_struct *napi, int 
budget)
return work_done;
 }

+static void macb_hresp_error_task(unsigned long data)
+{
+   struct macb *bp = (struct macb *)data;
+   struct net_device *dev = bp->dev;
+   struct macb_queue *queue = bp->queues;
+   unsigned int q;
+   u32 ctrl;
+
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+   ctrl = macb_readl(bp, NCR);
+   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
+   macb_writel(bp, NCR, ctrl);
+
+   netif_tx_stop_all_queues(dev);
+   netif_carrier_off(dev);
+
+   bp->macbgem_ops.mog_init_rings(bp);
+
+   /* Initialize TX and RX buffers */
+   for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+   queue_writel(queue, RBQP, lower_32_bits(queue->rx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, RBQPH,
+upper_32_bits(queue->rx_ring_dma));
+#endif
+   queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (bp->hw_dma_cap & HW_DMA_CAP_64B)
+   queue_writel(queue, TBQPH,
+upper_32_bits(queue->tx_ring_dma));
+#endif
+
+   /* Enable interrupts */
+   queue_writel(queue, IER,
+MACB_RX_INT_FLAGS |
+MACB_TX_INT_FLAGS |
+MACB_BIT(HRESP));
+   }
+
+   ctrl |= MACB_BIT(RE) | MACB_BIT(TE);
+   macb_writel(bp, NCR, ctrl);
+
+   netif_carrier_on(dev);
+   netif_tx_start_all_queues(dev);
+}
+
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
struct macb_queue *queue = dev_id;
@@ -1347,10 +1398,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
}

if (status & MACB_BIT(HRESP)) {
-   /* TODO: Reset the hardware, and maybe move the
-* netdev_err to a lower-priority context as well
-* (work queue?)
-*/
+   tasklet_schedule(&bp->hresp_err_tasklet);
netdev_err(dev, "DMA bus error: HRESP not OK\n");

if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
@@ -3937,6 +3985,9 @@ static int macb_probe(struct platform_device *pdev)
goto err_out_unregister_mdio;
}

+   tasklet_init(&bp->hresp_err_tasklet, macb_hresp_error_task,
+(unsigned long)bp);
+
phy_attached_info(phydev);

netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n",
--
2.7.4

This email and any attachments are intended for the sole use of the named 
recipient(s) and contain(s) confidential information that may be proprietary, 
privileged or copyrighted under applicable law. If you are not the intended 
recipient, do not read, copy, or forward this email message or any attachments. 
Delete this email message and any attachments immediately.


Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Jakub Kicinski
On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
> >> 3 netdev model breaks this configuration starting with the creation
> >> and naming of the 2 devices to udev needing to be aware of master and
> >> slave virtio-net devices.  
> > I don't understand this comment.  There is one virtio-net device and
> > one "virtio-bond" netdev.  And user space has to be aware of the special
> > automatic arrangement anyway, because it can't touch the VF.  It
> > doesn't make any difference whether it ignores the VF or PV and VF.
> > It simply can't touch the slaves, no matter how many there are.  
> 
> If the userspace is not expected to touch the slaves, then why do we need to
> take extra effort to expose a netdev that is just not really useful.

You said:
"[user space] needs to be aware of master and slave virtio-net devices."

I'm saying:
It has to be aware of the special arrangement whether there is an
explicit bond netdev or not.

> >> Also, from a user experience point of view, loading a virtio-net with
> >> BACKUP feature enabled will now show 2 virtio-net netdevs.  
> > One virtio-net and one virtio-bond, which represents what's happening.  
> This again assumes that we want to represent a bond setup. Can't we 
> treat this
> as virtio-net providing an alternate low-latency datapath by taking over 
> VF datapath?

Bond is just a familiar name, we can call it something else if you
prefer.  The point is there are two data paths which can have
independent low-level settings and a higher level entity with
global settings which represents any path to the outside world.

Hiding low-level netdevs from a lay user requires a generic solution.


Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Samudrala, Sridhar

On 1/26/2018 6:30 PM, Jakub Kicinski wrote:

On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:

On 1/26/2018 2:47 PM, Jakub Kicinski wrote:

On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:

On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:

and the VM is not expected to do any tuning/optimizations on the VF driver
directly,
i think the current patch that follows the netvsc model of 2 netdevs(virtio
and vf) should
work fine.

OK. For your use case that's fine. But that's too specific scenario
with lots of restrictions IMHO, perhaps very few users will benefit
from it, I'm not sure. If you're unwilling to move towards it, we'd
take this one and come back with a generic solution that is able to
address general use cases for VF/PT live migration .

I think that's a fine approach. Scratch your own itch!  I imagine a very
generic virtio-switchdev providing host routing info to guests could
address lots of usecases. A driver could bind to that one and enslave
arbitrary other devices.  Sounds reasonable.

But given the fundamental idea of a failover was floated at least as
early as 2013, and made 0 progress since precisely because it kept
trying to address more and more features, and given netvsc is already
using the basic solution with some success, I'm not inclined to block
this specific effort waiting for the generic one.

I think there is an agreement that the extra netdev will be useful for
more advanced use cases, and is generally preferable.  What is the
argument for not doing that from the start?  If it was made I must have
missed it.  Is it just unwillingness to write the extra 300 lines of
code?  Sounds like a pretty weak argument when adding kernel ABI is at
stake...

I am still not clear on the need for the extra netdev created by
virtio_net. The only advantage i can see is that the stats can be
broken between VF and virtio datapaths compared to the aggregrated
stats on virtio netdev as seen with the 2 netdev approach.

Maybe you're not convinced but multiple arguments were made.
All the arguments seem to either saying that semantically this doesn't 
look like

a bond OR suggesting usecases that this patch is not trying to solve.
This approach should help cloud environments where the guest networking 
is fully
controlled from the hypervisor via the PF driver or via port representor 
when switchdev
mode is enabled. The guest admin is not expected or allowed to make any 
networking

changes from the VM.




With 2 netdev model, any VM image that has a working network
configuration will transparently get VF based acceleration without
any changes.

Nothing happens transparently.  Things may happen automatically.  The
VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
something it did not use to be.  And configures and reports some
information from the PV (e.g. speed) but PV doesn't pass traffic any
longer.



3 netdev model breaks this configuration starting with the creation
and naming of the 2 devices to udev needing to be aware of master and
slave virtio-net devices.

I don't understand this comment.  There is one virtio-net device and
one "virtio-bond" netdev.  And user space has to be aware of the special
automatic arrangement anyway, because it can't touch the VF.  It
doesn't make any difference whether it ignores the VF or PV and VF.
It simply can't touch the slaves, no matter how many there are.


If the userspace is not expected to touch the slaves, then why do we need to
take extra effort to expose a netdev that is just not really useful.




Also, from a user experience point of view, loading a virtio-net with
BACKUP feature enabled will now show 2 virtio-net netdevs.

One virtio-net and one virtio-bond, which represents what's happening.
This again assumes that we want to represent a bond setup. Can't we 
treat this
as virtio-net providing an alternate low-latency datapath by taking over 
VF datapath?



For live migration with advanced usecases that Siwei is suggesting, i
think we need a new driver with a new device type that can track the
VF specific feature settings even when the VF driver is unloaded.




Re: [PATCH v2] net: macb: Handle HRESP error

2018-01-26 Thread Harini Katakam
Hi David,

On Fri, Jan 26, 2018 at 9:25 PM, David Miller  wrote:
> From: Harini Katakam 
> Date: Fri, 26 Jan 2018 16:12:11 +0530
>
>> From: Harini Katakam 
>>
>> Handle HRESP error by doing a SW reset of RX and TX and
>> re-initializing the descriptors, RX and TX queue pointers.
>>
>> Signed-off-by: Harini Katakam 
>> Signed-off-by: Michal Simek 
>> ---
>> v2:
>> Rebased on top of latest net-next and reinitialized
>> all rx queues.
>
> Your patch was corrupted by your email client, it changed TAB characters
> into sequences of SPACES amongst other things.  We cannot integrate your
> changes until you fix this.

I'll fix this.

>
>> This email and any attachments are intended for the sole use of the named 
>> recipient(s) and contain(s) confidential information that may be 
>> proprietary, privileged or copyrighted under applicable law. If you are not 
>> the intended recipient, do not read, copy, or forward this email message or 
>> any attachments. Delete this email message and any attachments immediately.
>
> This is also completely inappropriate for a public development list and
> you must eliminate this signature on future postings or we will have to
> ignore them.

Sorry, I usually remove this, missed it this time.
Will resend.

Regards,
Harini


Re: [PATCH] atm: firestream: Replace GFP_ATOMIC with GFP_KERNEL in fs_send

2018-01-26 Thread Jia-Ju Bai



On 2018/1/27 1:08, Al Viro wrote:

On Fri, Jan 26, 2018 at 11:07:39AM -0500, David Miller wrote:

This is found by a static analysis tool named DCNS written by myself.

The trouble is, places like
net/atm/raw.c:65:   vcc->send = atm_send_aal0;
net/atm/raw.c:74:   vcc->send = vcc->dev->ops->send;
net/atm/raw.c:83:   vcc->send = vcc->dev->ops->send;
mean extra call chains.  It's *not* just vcc_sendmsg(), and e.g.
 ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
 ? DROP_PACKET : 1;
 bh_unlock_sock(sk_atm(vcc));
in pppoatm_send() definitely is called under a spinlock.

Looking through the driver (in advanced bitrot, as usual for drivers/atm),
I'd say that submit_queue() is fucked in head in the "queue full" case.
And judging by the history, had been thus since the original merge...

Jia-Ju, I'm probably not going to apply any of your GFP_KERNEL
conversions.

Al's analysis above is similar to how things looked for other patches
you submited of this nature.

So because of the lack of care and serious auditing you put into these
conversions, I really have no choice but to drop them from my queue
because on the whole they are adding bugs rather than improving the
code.

FWIW, the tool *does* promise to be useful


Thanks, I am happy to hear that :)


- as far as I understand it
looks for places where GFP_ATOMIC allocation goes with blocking operations
near every callchain leading there.  And that indicates something fishy
going on - either pointless GFP_ATOMIC (in benign case), or something
much nastier: a callchain that would require GFP_ATOMIC.  In that case
whatever blocking operation found along the way is a bug.


In fact, my tool first collects all places of GFP_ATOMIC and mdelay in 
the whole kernel code.
Then it starts analysis from the entry of each interrupt handler and 
spin-lock function call in the whole kernel code,
to mark the places of GFP_ATOMIC and mdelay that are called in atomic 
context.
The remaining unmarked places of GFP_ATOMIC and mdelay are reported and 
can be replaced with GFP_KERNEL and mdelay (or usleep_range).


Though my tool has handled some common situations of function pointers,
But it does not well handle function pointer passing in this code 
(vcc->send = vcc->dev->ops->send), so the tool needs to be improved.

I am sorry for my incorrect report...


This time it has, AFAICS, caught a genuine bug in drivers/atm/firestream.c -
static void submit_qentry (struct fs_dev *dev, struct queue *q, struct 
FS_QENTRY *qe)
{
 u32 wp;
 struct FS_QENTRY *cqe;

 /* XXX Sanity check: the write pointer can be checked to be
still the same as the value passed as qe... -- REW */
 /*  udelay (5); */
 while ((wp = read_fs (dev, Q_WP (q->offset))) & Q_FULL) {
 fs_dprintk (FS_DEBUG_TXQ, "Found queue at %x full. Waiting.\n",
 q->offset);
 schedule ();
 }
...
static void submit_queue (struct fs_dev *dev, struct queue *q,
   u32 cmd, u32 p1, u32 p2, u32 p3)
{
 struct FS_QENTRY *qe;

 qe = get_qentry (dev, q);
 qe->cmd = cmd;
 qe->p0 = p1;
 qe->p1 = p2;
 qe->p2 = p3;
 submit_qentry (dev,  q, qe);
...
static int fs_send (struct atm_vcc *atm_vcc, struct sk_buff *skb)
{
...
 td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC);
...
 submit_queue (dev, &dev->hp_txq,
   QE_TRANSMIT_DE | vcc->channo,
   virt_to_bus (td), 0,
   virt_to_bus (td));
...

Either all callchains leading to fs_send() are in non-atomic contexts
(in which case that GFP_ATOMIC would be pointless) or there's one
where we cannot block.  Any such would be a potential deadlock.

And the latter appears to be the case - fs_send() is atmdev_ops ->send(),
which can end up in atm_vcc ->send(), which can be called from under
->sk_lock.slock.


Yes, I think schedule() can cause a sleep-in-atomic-context bug.


What would be really useful:
* list of "here's a list of locations where we do something
blocking; each callchain to this GFP_ATOMIC allocation passes either
upstream of one of those without leaving atomic context in between
or downstream without entering one".
* after that - backtracking these callchains further, watching
for ones in atomic contexts.  Can be done manually, but if that tool
can assist in doing so, all the better.  If we do find one, we have
found a deadlock - just take the blocking operation reported for
that callchain and that's it.  If it's not an obvious false positive
(e.g.
if (!foo)
spin_lock();
.
if (foo)
schedule();
), it's worth reporting to maintainers of the code in question.
* if all callchains reach obviously non-atomic contexts
(syscall entry, for example, or a kernel thread payload, or
a function documented to 

[PATCH bpf-next] netdevsim: fix overflow on the error path

2018-01-26 Thread Jakub Kicinski
Undo loop condition on the error path would cause the i counter
to go below zero, if allocation failure happened with the first
(i.e. 0th) element of the array.

Fixes: 395cacb5f1a0 ("netdevsim: bpf: support fake map offload")
Reported-by: Dan Carpenter 
Signed-off-by: Jakub Kicinski 
---
 drivers/net/netdevsim/bpf.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index de73c1ff0939..75c25306d234 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -480,8 +480,7 @@ static int
 nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap)
 {
struct nsim_bpf_bound_map *nmap;
-   unsigned int i;
-   int err;
+   int i, err;
 
if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY &&
offmap->map.map_type != BPF_MAP_TYPE_HASH))
@@ -518,7 +517,7 @@ nsim_bpf_map_alloc(struct netdevsim *ns, struct 
bpf_offloaded_map *offmap)
return 0;
 
 err_free:
-   while (--i) {
+   while (--i >= 0) {
kfree(nmap->entry[i].key);
kfree(nmap->entry[i].value);
}
-- 
2.15.1



Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Jakub Kicinski
On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
> On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
> > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:  
> >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:  
>  and the VM is not expected to do any tuning/optimizations on the VF 
>  driver
>  directly,
>  i think the current patch that follows the netvsc model of 2 
>  netdevs(virtio
>  and vf) should
>  work fine.  
> >>> OK. For your use case that's fine. But that's too specific scenario
> >>> with lots of restrictions IMHO, perhaps very few users will benefit
> >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
> >>> take this one and come back with a generic solution that is able to
> >>> address general use cases for VF/PT live migration .  
> >> I think that's a fine approach. Scratch your own itch!  I imagine a very
> >> generic virtio-switchdev providing host routing info to guests could
> >> address lots of usecases. A driver could bind to that one and enslave
> >> arbitrary other devices.  Sounds reasonable.
> >>
> >> But given the fundamental idea of a failover was floated at least as
> >> early as 2013, and made 0 progress since precisely because it kept
> >> trying to address more and more features, and given netvsc is already
> >> using the basic solution with some success, I'm not inclined to block
> >> this specific effort waiting for the generic one.  
> > I think there is an agreement that the extra netdev will be useful for
> > more advanced use cases, and is generally preferable.  What is the
> > argument for not doing that from the start?  If it was made I must have
> > missed it.  Is it just unwillingness to write the extra 300 lines of
> > code?  Sounds like a pretty weak argument when adding kernel ABI is at
> > stake...  
> 
> I am still not clear on the need for the extra netdev created by 
> virtio_net. The only advantage i can see is that the stats can be
> broken between VF and virtio datapaths compared to the aggregrated
> stats on virtio netdev as seen with the 2 netdev approach.

Maybe you're not convinced but multiple arguments were made.

> With 2 netdev model, any VM image that has a working network 
> configuration will transparently get VF based acceleration without
> any changes. 

Nothing happens transparently.  Things may happen automatically.  The
VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
something it did not use to be.  And configures and reports some
information from the PV (e.g. speed) but PV doesn't pass traffic any
longer.

> 3 netdev model breaks this configuration starting with the creation
> and naming of the 2 devices to udev needing to be aware of master and
> slave virtio-net devices.

I don't understand this comment.  There is one virtio-net device and
one "virtio-bond" netdev.  And user space has to be aware of the special
automatic arrangement anyway, because it can't touch the VF.  It
doesn't make any difference whether it ignores the VF or PV and VF.
It simply can't touch the slaves, no matter how many there are.

> Also, from a user experience point of view, loading a virtio-net with
> BACKUP feature enabled will now show 2 virtio-net netdevs.

One virtio-net and one virtio-bond, which represents what's happening.

> For live migration with advanced usecases that Siwei is suggesting, i 
> think we need a new driver with a new device type that can track the
> VF specific feature settings even when the VF driver is unloaded.


pull-request: bpf-next 2018-01-26

2018-01-26 Thread Alexei Starovoitov
Hi David,

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) A number of extensions to tcp-bpf, from Lawrence.
- direct R or R/W access to many tcp_sock fields via bpf_sock_ops
- passing up to 3 arguments to bpf_sock_ops functions
- tcp_sock field bpf_sock_ops_cb_flags for controlling callbacks
- optionally calling bpf_sock_ops program when RTO fires
- optionally calling bpf_sock_ops program when packet is retransmitted
- optionally calling bpf_sock_ops program when TCP state changes
- access to tclass and sk_txhash
- new selftest

2) div/mod exception handling, from Daniel.
One of the ugly leftovers from the early eBPF days is that div/mod
operations based on registers have a hard-coded src_reg == 0 test
in the interpreter as well as in JIT code generators that would
return from the BPF program with exit code 0. This was basically
adopted from cBPF interpreter for historical reasons.
There are multiple reasons why this is very suboptimal and prone
to bugs. To name one: the return code mapping for such abnormal
program exit of 0 does not always match with a suitable program
type's exit code mapping. For example, '0' in tc means action 'ok'
where the packet gets passed further up the stack, which is just
undesirable for such cases (e.g. when implementing policy) and
also does not match with other program types.
After considering _four_ different ways to address the problem,
we adapt the same behavior as on some major archs like ARMv8:
X div 0 results in 0, and X mod 0 results in X. aarch64 and
aarch32 ISA do not generate any traps or otherwise aborts
of program execution for unsigned divides.
Given the options, it seems the most suitable from
all of them, also since major archs have similar schemes in
place. Given this is all in the realm of undefined behavior,
we still have the option to adapt if deemed necessary.

3) sockmap sample refactoring, from John.

4) lpm map get_next_key fixes, from Yonghong.

5) test cleanups, from Alexei and Prashant.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git

There should be no merge conflicts.

Thanks a lot!



The following changes since commit e8a22b5f079449f1803d37ce2b5d09acaa68368d:

  net: aquantia: make symbol hw_atl_boards static (2018-01-23 10:59:42 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git 

for you to fetch changes up to 8223967fe0b8eb2448cca5cfe3c64a0838e6f60d:

  Merge branch 'fix-lpm-map' (2018-01-26 17:06:24 -0800)


Alexei Starovoitov (6):
  selftests/bpf: speedup test_maps
  selftests/bpf: fix test_dev_cgroup
  selftests/bpf: make 'dubious pointer arithmetic' test useful
  Merge branch 'bpf-more-sock_ops-callbacks'
  Merge branch 'bpf-improvements-and-fixes'
  Merge branch 'fix-lpm-map'

Daniel Borkmann (14):
  Merge branch 'bpf-samples-sockmap-improvements'
  bpf: xor of a/x in cbpf can be done in 32 bit alu
  bpf: improve dead code sanitizing
  bpf: make unknown opcode handling more robust
  bpf: fix subprog verifier bypass by div/mod by 0 exception
  bpf, x86_64: remove obsolete exception handling from div/mod
  bpf, arm64: remove obsolete exception handling from div/mod
  bpf, s390x: remove obsolete exception handling from div/mod
  bpf, ppc64: remove obsolete exception handling from div/mod
  bpf, sparc64: remove obsolete exception handling from div/mod
  bpf, mips64: remove obsolete exception handling from div/mod
  bpf, mips64: remove unneeded zero check from div/mod with k
  bpf, arm: remove obsolete exception handling from div/mod
  bpf: add further test cases around div/mod and others

John Fastabend (7):
  bpf: refactor sockmap sample program update for arg parsing
  bpf: add sendmsg option for testing BPF programs
  bpf: sockmap sample, use fork() for send and recv
  bpf: sockmap sample, report bytes/sec
  bpf: sockmap sample add base test without any BPF for comparison
  bpf: sockmap put client sockets in blocking mode
  bpf: sockmap set rlimit

Lawrence Brakmo (13):
  bpf: Only reply field should be writeable
  bpf: Make SOCK_OPS_GET_TCP size independent
  bpf: Make SOCK_OPS_GET_TCP struct independent
  bpf: Add write access to tcp_sock and sock fields
  bpf: Support passing args to sock_ops bpf function
  bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock
  bpf: Add sock_ops RTO callback
  bpf: Add support for reading sk_state and more
  bpf: Add sock_ops R/W access to tclass
  bpf: Add BPF_SOCK_OPS_RETRANS_CB
  bpf: Add BPF_SOCK_OPS_STATE_CB
  bpf: add selftest 

Re: [PATCH bpf-next v7 3/5] libbpf: add error reporting in XDP

2018-01-26 Thread Daniel Borkmann
On 01/25/2018 01:05 AM, Eric Leblond wrote:
> Parse netlink ext attribute to get the error message returned by
> the card. Code is partially take from libnl.
> 
> We add netlink.h to the uapi include of tools. And we need to
> avoid include of userspace netlink header to have a successful
> build of sample so nlattr.h has a define to avoid
> the inclusion. Using a direct define could have been an issue
> as NLMSGERR_ATTR_MAX can change in the future.
> 
> We also define SOL_NETLINK if not defined to avoid to have to
> copy socket.h for a fixed value.
> 
> Signed-off-by: Eric Leblond 
> Acked-by: Alexei Starovoitov 
> 
> remote rtne
> 
> Signed-off-by: Eric Leblond 

Some leftover artifact from squashing commits?

>  samples/bpf/Makefile   |   2 +-
>  tools/lib/bpf/Build|   2 +-
>  tools/lib/bpf/bpf.c|  13 +++-
>  tools/lib/bpf/nlattr.c | 187 
> +
>  tools/lib/bpf/nlattr.h |  72 +++
>  5 files changed, 273 insertions(+), 3 deletions(-)
>  create mode 100644 tools/lib/bpf/nlattr.c
>  create mode 100644 tools/lib/bpf/nlattr.h
> 
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 7f61a3d57fa7..5c4cd3745282 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -45,7 +45,7 @@ hostprogs-y += xdp_rxq_info
>  hostprogs-y += syscall_tp
>  
>  # Libbpf dependencies
> -LIBBPF := ../../tools/lib/bpf/bpf.o
> +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
>  CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
>  
>  test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
> diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
> index d8749756352d..64c679d67109 100644
> --- a/tools/lib/bpf/Build
> +++ b/tools/lib/bpf/Build
> @@ -1 +1 @@
> -libbpf-y := libbpf.o bpf.o
> +libbpf-y := libbpf.o bpf.o nlattr.o
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 749a447ec9ed..765fd95b0657 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -27,7 +27,7 @@
>  #include "bpf.h"
>  #include "libbpf.h"
>  #include "nlattr.h"
> -#include 
> +#include 

Okay, so here it's put back from prior added uapi/linux/rtnetlink.h
into linux/rtnetlink.h. Could you add this properly in the first
commit rather than relative adjustment/fix within the same set?

>  #include 
>  #include 
>  
> @@ -37,6 +37,10 @@
>  #define IFLA_XDP_FLAGS   3
>  #endif
>  
> +#ifndef SOL_NETLINK
> +#define SOL_NETLINK 270
> +#endif

This would need include/linux/socket.h into tools/ include infra
as well, no?

>  /*
>   * When building perf, unistd.h is overridden. __NR_bpf is
>   * required to be defined explicitly.
> @@ -441,6 +445,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
>   struct nlmsghdr *nh;
>   struct nlmsgerr *err;
>   socklen_t addrlen;


Re: [PATCH bpf-next v7 2/5] libbpf: add function to setup XDP

2018-01-26 Thread Daniel Borkmann
On 01/25/2018 01:05 AM, Eric Leblond wrote:
> Most of the code is taken from set_link_xdp_fd() in bpf_load.c and
> slightly modified to be library compliant.
> 
> Signed-off-by: Eric Leblond 
> Acked-by: Alexei Starovoitov 
> ---
>  tools/lib/bpf/bpf.c| 127 
> +
>  tools/lib/bpf/libbpf.c |   2 +
>  tools/lib/bpf/libbpf.h |   4 ++
>  3 files changed, 133 insertions(+)
> 
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 5128677e4117..749a447ec9ed 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -25,6 +25,17 @@
>  #include 
>  #include 
>  #include "bpf.h"
> +#include "libbpf.h"
> +#include "nlattr.h"
> +#include 

Doesn't libbpf pull in already -I$(srctree)/tools/include/uapi? Seems the
other headers don't need 'uapi/' path prefix.

> +#include 
> +#include 
> +
> +#ifndef IFLA_XDP_MAX
> +#define IFLA_XDP 43
> +#define IFLA_XDP_FD  1
> +#define IFLA_XDP_FLAGS   3
> +#endif

Hm, given we pull in tools/include/uapi/linux/netlink.h, shouldn't we also
get include/uapi/linux/if_link.h dependency in here, so above ifdef workaround
can be avoided?

>  /*
>   * When building perf, unistd.h is overridden. __NR_bpf is
> @@ -46,7 +57,9 @@
>  # endif
>  #endif
>  
> +#ifndef min
>  #define min(x, y) ((x) < (y) ? (x) : (y))
> +#endif
>  
>  static inline __u64 ptr_to_u64(const void *ptr)
>  {
> @@ -413,3 +426,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, 
> __u32 *info_len)
>  
>   return err;
>  }
> +
> +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
> +{
> + struct sockaddr_nl sa;
> + int sock, seq = 0, len, ret = -1;
> + char buf[4096];
> + struct nlattr *nla, *nla_xdp;
> + struct {
[...]


Re: general protection fault in __lock_acquire (2)

2018-01-26 Thread Eric Biggers
On Thu, Nov 02, 2017 at 03:55:00AM -0700, syzbot wrote:
> Hello,
> 
> syzkaller hit the following crash on
> fa8785e862ef644f742558f1a8c91eca6f3f0004
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> 
> 
> 
> 
> R10: 20869000 R11: 0246 R12: 004a96f0
> R13:  R14: 7fe1d40269c8 R15: 7fe1d4026b38
> Subscriber rejected, no memory
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 3 PID: 16314 Comm: syz-executor6 Not tainted 4.14.0-rc7-next-20171102+
> #9
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: 88003d54a000 task.stack: 880038f7
> RIP: 0010:__lock_acquire+0xdac/0x4770 kernel/locking/lockdep.c:3378
> RSP: 0018:880038f77328 EFLAGS: 00010002
> RAX: dc00 RBX:  RCX: 
> RDX: 0004 RSI:  RDI: 85ecb380
> RBP: 880038f77830 R08: 0001 R09: 
> R10:  R11: 8744cca0 R12: 88003d54a000
> R13:  R14: 0001 R15: 0020
> FS:  7fe1d4027700() GS:88006df0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 7fbbb18db000 CR3: 6a863000 CR4: 26e0
> Call Trace:
>  lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:4004
>  __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
>  _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:173
>  spin_lock_bh include/linux/spinlock.h:319 [inline]
>  tipc_subscrb_subscrp_delete+0x8f/0x480 net/tipc/subscr.c:201
>  tipc_subscrb_delete net/tipc/subscr.c:238 [inline]
>  tipc_subscrb_release_cb+0x17/0x30 net/tipc/subscr.c:316
>  tipc_close_conn+0x171/0x270 net/tipc/server.c:204
>  tipc_topsrv_kern_subscr+0x724/0x810 net/tipc/server.c:514
>  tipc_group_create+0x702/0x9c0 net/tipc/group.c:184
>  tipc_sk_join net/tipc/socket.c:2747 [inline]
>  tipc_setsockopt+0x249/0xc10 net/tipc/socket.c:2861
>  SYSC_setsockopt net/socket.c:1851 [inline]
>  SyS_setsockopt+0x189/0x360 net/socket.c:1830
>  entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x447c89
> RSP: 002b:7fe1d4026bd8 EFLAGS: 0246 ORIG_RAX: 0036
> RAX: ffda RBX: 7fe1d40276cc RCX: 00447c89
> RDX: 0087 RSI: 010f RDI: 0013
> RBP: 0086 R08: 0010 R09: 
> R10: 20869000 R11: 0246 R12: 004a96f0
> R13:  R14: 7fe1d40269c8 R15: 7fe1d4026b38
> Code: e9 03 f3 48 ab 48 81 c4 e0 04 00 00 44 89 f0 5b 41 5c 41 5d 41 5e 41
> 5f 5d c3 4c 89 fa 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f
> 85 bf 33 00 00 49 81 3f 80 87 87 86 41 be 00 00
> RIP: __lock_acquire+0xdac/0x4770 kernel/locking/lockdep.c:3378 RSP:
> 880038f77328
> ---[ end trace 61ded41cffc497c5 ]---
> Kernel panic - not syncing: Fatal exception in interrupt
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
> 
> 
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkal...@googlegroups.com.
> Please credit me with: Reported-by: syzbot 
> 
> syzbot will keep track of this bug report.
> Once a fix for this bug is committed, please reply to this email with:
> #syz fix: exact-commit-title

No longer occurring, seems to have been fixed by:

#syz fix: tipc: fix a null pointer deref on error path


Re: [PATCH bpf-next 0/2] bpf: fix kernel page fault in lpm map trie_get_next_key

2018-01-26 Thread Alexei Starovoitov
On Fri, Jan 26, 2018 at 03:06:06PM -0800, Yonghong Song wrote:
> A kernel page fault which happens in lpm map trie_get_next_key is reported
> by syzbot and Eric. The issue was introduced by commit b471f2f1de8b
> ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map").
> Patch #1 fixed the issue in the kernel and patch #2 adds a multithreaded
> test case in tools/testing/selftests/bpf/test_lpm_map.

Applied to bpf-next, thank you Yonghong!



Re: [PATCH bpf-next 00/13] BPF improvements and fixes

2018-01-26 Thread Alexei Starovoitov
On Fri, Jan 26, 2018 at 11:33:35PM +0100, Daniel Borkmann wrote:
> This set contains a small cleanup in cBPF prologue generation and
> otherwise fixes an outstanding issue related to BPF to BPF calls
> and exception handling. For details please see related patches.
> Last but not least, BPF selftests is extended with several new
> test cases.

Applied to bpf-next, Thank you Daniel!



[bridge] IGMP/MLD snooping per port and per VLAN support

2018-01-26 Thread Joachim Nilsson
Hi,

longtime lurker, first time poster.  In the "new" VLAN aware bridge,
is there anyone else except me (and possibly Cumulus) thinking about
per port *and* per VLAN support for IGMP/MLD snooping in the bridge?

By that I mean bridge support for per port and per VLAN querier, and
group membership, which the current bridge implementation, as far as
I can see, unfortunately is lacking.

Westermo, where I work, has invested heavily in DSA/switchdev/bridge
and we are *extremely* happy with it all (awesome work ppl!), and we
are very interested in helping out with multicast work in the bridge
so any answers to this post are most welcome :)

Sincerely
 /Joachim

P.S.
My colleague Tobias is of the opinion that full support for
this will, like STP/RSTP, be politely shown the way to user
space by the senior netdev devs. So yeah, that's what we're
considering atm., but I thought it would be good idea to at
the very least post the question here first.
D.S.
P.P.S.
Yes, openvswitch supports this *perfectly*, which is a bit
annoying, but also in userspace and only if I also run sth
like pimd/mrouted to act as querier (which most ppl don't
really need.) on their magical internal/local CPU port.
D.D.S.



[RFC 2/2] Revert "hv_netvsc: netvsc_teardown_gpadl() split"

2018-01-26 Thread Stephen Hemminger
This reverts commit 0cf737808ae7cb25e952be619db46b9147a92f46.

The problem that the previous commit was trying to solve was that
undoing the mapping of the receive buffer after revoke was
problematic.  This was because the shutdown logic was not ensuring
that there were no receive and sends in flight. The changes in
commit 30de1885e897 ("hv_netvsc: make sure device is idle before changes")
ensure that device is completely idle.

Windows Server 2012 does not allow the receive buffer mapping to be
undone after the channel is closed. This is because it assumes that when
the channel is closed, the guest won't use it.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 69 ++---
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 619a04f98321..6db9bfb5c595 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -101,11 +101,12 @@ static void free_netvsc_device_rcu(struct netvsc_device 
*nvdev)
call_rcu(&nvdev->rcu, free_netvsc_device);
 }
 
-static void netvsc_revoke_buf(struct hv_device *device,
- struct netvsc_device *net_device)
+static void netvsc_destroy_buf(struct hv_device *device)
 {
struct nvsp_message *revoke_packet;
struct net_device *ndev = hv_get_drvdata(device);
+   struct net_device_context *ndc = netdev_priv(ndev);
+   struct netvsc_device *net_device = rtnl_dereference(ndc->nvdev);
int ret;
 
/*
@@ -148,6 +149,28 @@ static void netvsc_revoke_buf(struct hv_device *device,
net_device->recv_section_cnt = 0;
}
 
+   /* Teardown the gpadl on the vsp end */
+   if (net_device->recv_buf_gpadl_handle) {
+   ret = vmbus_teardown_gpadl(device->channel,
+  net_device->recv_buf_gpadl_handle);
+
+   /* If we failed here, we might as well return and have a leak
+* rather than continue and a bugchk
+*/
+   if (ret != 0) {
+   netdev_err(ndev,
+  "unable to teardown receive buffer's 
gpadl\n");
+   return;
+   }
+   net_device->recv_buf_gpadl_handle = 0;
+   }
+
+   if (net_device->recv_buf) {
+   /* Free up the receive buffer */
+   vfree(net_device->recv_buf);
+   net_device->recv_buf = NULL;
+   }
+
/* Deal with the send buffer we may have setup.
 * If we got a  send section size, it means we received a
 * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent
@@ -188,35 +211,7 @@ static void netvsc_revoke_buf(struct hv_device *device,
}
net_device->send_section_cnt = 0;
}
-}
-
-static void netvsc_teardown_gpadl(struct hv_device *device,
- struct netvsc_device *net_device)
-{
-   struct net_device *ndev = hv_get_drvdata(device);
-   int ret;
-
-   if (net_device->recv_buf_gpadl_handle) {
-   ret = vmbus_teardown_gpadl(device->channel,
-  net_device->recv_buf_gpadl_handle);
-
-   /* If we failed here, we might as well return and have a leak
-* rather than continue and a bugchk
-*/
-   if (ret != 0) {
-   netdev_err(ndev,
-  "unable to teardown receive buffer's 
gpadl\n");
-   return;
-   }
-   net_device->recv_buf_gpadl_handle = 0;
-   }
-
-   if (net_device->recv_buf) {
-   /* Free up the receive buffer */
-   vfree(net_device->recv_buf);
-   net_device->recv_buf = NULL;
-   }
-
+   /* Teardown the gpadl on the vsp end */
if (net_device->send_buf_gpadl_handle) {
ret = vmbus_teardown_gpadl(device->channel,
   net_device->send_buf_gpadl_handle);
@@ -431,8 +426,7 @@ static int netvsc_init_buf(struct hv_device *device,
goto exit;
 
 cleanup:
-   netvsc_revoke_buf(device, net_device);
-   netvsc_teardown_gpadl(device, net_device);
+   netvsc_destroy_buf(device);
 
 exit:
return ret;
@@ -551,6 +545,11 @@ static int netvsc_connect_vsp(struct hv_device *device,
return ret;
 }
 
+static void netvsc_disconnect_vsp(struct hv_device *device)
+{
+   netvsc_destroy_buf(device);
+}
+
 /*
  * netvsc_device_remove - Callback when the root bus device is removed
  */
@@ -564,7 +563,7 @@ void netvsc_device_remove(struct hv_device *device)
 
cancel_work_sync(&net_device->subchan_work);
 
-   netvsc_revoke_buf(device, net_device);
+   netvsc_disconnect_vsp(device);
 
RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
 
@@ -577,8 +576,6 @@ void 

[RFC 1/2] hv_netvsc: make sure device is idle before changes

2018-01-26 Thread Stephen Hemminger
Make sure that device is in detached state before doing ring
and mtu changes. When doing these changes, wait for all outstanding
send completions and ring buffer events.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/hyperv_net.h   |  1 -
 drivers/net/hyperv/netvsc.c   |  6 +
 drivers/net/hyperv/netvsc_drv.c   | 29 +--
 drivers/net/hyperv/rndis_filter.c | 48 ++-
 4 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 0db3bd1ea06f..a846a9c50ddb 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -211,7 +211,6 @@ void netvsc_channel_cb(void *context);
 int netvsc_poll(struct napi_struct *napi, int budget);
 
 void rndis_set_subchannel(struct work_struct *w);
-bool rndis_filter_opened(const struct netvsc_device *nvdev);
 int rndis_filter_open(struct netvsc_device *nvdev);
 int rndis_filter_close(struct netvsc_device *nvdev);
 struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 17e529af79dc..619a04f98321 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -849,7 +849,7 @@ int netvsc_send(struct net_device *ndev,
bool try_batch, xmit_more;
 
/* If device is rescinded, return error and packet will get dropped. */
-   if (unlikely(!net_device || net_device->destroy))
+   if (unlikely(!net_device))
return -ENODEV;
 
/* We may race with netvsc_connect_vsp()/netvsc_init_buf() and get
@@ -996,10 +996,6 @@ static int send_recv_completions(struct net_device *ndev,
mrc->first = 0;
}
 
-   /* receive completion ring has been emptied */
-   if (unlikely(nvdev->destroy))
-   wake_up(&nvdev->wait_drain);
-
return 0;
 }
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c5584c2d440e..ef395e379a83 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -820,7 +820,7 @@ static int netvsc_set_channels(struct net_device *net,
channels->rx_count || channels->tx_count || channels->other_count)
return -EINVAL;
 
-   if (!nvdev || nvdev->destroy)
+   if (!nvdev)
return -ENODEV;
 
if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
@@ -830,9 +830,6 @@ static int netvsc_set_channels(struct net_device *net,
return -EINVAL;
 
orig = nvdev->num_chn;
-   was_opened = rndis_filter_opened(nvdev);
-   if (was_opened)
-   rndis_filter_close(nvdev);
 
memset(&device_info, 0, sizeof(device_info));
device_info.num_chn = count;
@@ -841,6 +838,11 @@ static int netvsc_set_channels(struct net_device *net,
device_info.recv_sections = nvdev->recv_section_cnt;
device_info.recv_section_size = nvdev->recv_section_size;
 
+   was_opened = netif_running(net);
+   netif_device_detach(net);
+   if (was_opened)
+   rndis_filter_close(nvdev);
+
rndis_filter_device_remove(dev, nvdev);
 
nvdev = rndis_filter_device_add(dev, _info);
@@ -859,6 +861,8 @@ static int netvsc_set_channels(struct net_device *net,
if (was_opened)
rndis_filter_open(nvdev);
 
+   netif_device_attach(net);
+
/* We may have missed link change notifications */
net_device_ctx->last_reconfig = 0;
schedule_delayed_work(&net_device_ctx->dwork, 0);
@@ -934,7 +938,7 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
bool was_opened;
int ret = 0;
 
-   if (!nvdev || nvdev->destroy)
+   if (!nvdev)
return -ENODEV;
 
/* Change MTU of underlying VF netdev first. */
@@ -944,11 +948,6 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
return ret;
}
 
-   netif_device_detach(ndev);
-   was_opened = rndis_filter_opened(nvdev);
-   if (was_opened)
-   rndis_filter_close(nvdev);
-
memset(&device_info, 0, sizeof(device_info));
device_info.num_chn = nvdev->num_chn;
device_info.send_sections = nvdev->send_section_cnt;
@@ -956,6 +955,11 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
device_info.recv_sections = nvdev->recv_section_cnt;
device_info.recv_section_size = nvdev->recv_section_size;
 
+   was_opened = netif_running(ndev);
+   netif_device_detach(ndev);
+   if (was_opened)
+   rndis_filter_close(nvdev);
+
rndis_filter_device_remove(hdev, nvdev);
 
ndev->mtu = mtu;
@@ -1497,7 +1501,7 @@ static int netvsc_set_ringparam(struct net_device *ndev,
bool was_opened;
int ret = 0;
 
-   if (!nvdev || nvdev->destroy)
+   if (!nvdev)
return 

[RFC 0/2] hv_netvsc shutdown redo

2018-01-26 Thread Stephen Hemminger
These patches change how teardown of Hyper-V network devices
is done. These are tested on WS2012 and WS2016.

It moves the tx/rx shutdown into the rndis close handling,
and that makes earlier gpadl changes unnecessary.

Stephen Hemminger (2):
  hv_netvsc: make sure device is idle before changes
  Revert "hv_netvsc: netvsc_teardown_gpadl() split"

 drivers/net/hyperv/hyperv_net.h   |  1 -
 drivers/net/hyperv/netvsc.c   | 75 ++-
 drivers/net/hyperv/netvsc_drv.c   | 29 ---
 drivers/net/hyperv/rndis_filter.c | 48 -
 4 files changed, 73 insertions(+), 80 deletions(-)

-- 
2.15.1



[PATCH net-next] net/mlx5e: IPoIB, Fix copy-paste bug in flow steering refactoring

2018-01-26 Thread Saeed Mahameed
From: Gal Pressman 

On TTC table creation, the indirection TIRs should be used instead of
the inner indirection TIRs.

Fixes: 1ae1df3a1193 ("net/mlx5e: Refactor RSS related objects and code")
Signed-off-by: Gal Pressman 
Reviewed-by: Shalom Lagziel 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 264504a990ca..1f50b77a081d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -271,7 +271,7 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv 
*priv)
 
mlx5e_set_ttc_ft_params(&ttc_params);
for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
-   ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn;
+   ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn;
 
err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc);
if (err) {
-- 
2.14.3



[PATCH net] ipv6: addrconf: break critical section in addrconf_verify_rtnl()

2018-01-26 Thread Eric Dumazet
From: Eric Dumazet 

Heiner reported a lockdep splat [1]

This is caused by attempting GFP_KERNEL allocation while RCU lock is
held and BH blocked.

We believe that addrconf_verify_rtnl() could run for a long period,
so instead of using GFP_ATOMIC here as Ido suggested, we should break
the critical section and restart it after the allocation.


[1]
[86220.125562] =
[86220.125586] WARNING: suspicious RCU usage
[86220.125612] 4.15.0-rc7-next-20180110+ #7 Not tainted
[86220.125641] -
[86220.125666] kernel/sched/core.c:6026 Illegal context switch in RCU-bh 
read-side critical section!
[86220.125711]
   other info that might help us debug this:

[86220.125755]
   rcu_scheduler_active = 2, debug_locks = 1
[86220.125792] 4 locks held by kworker/0:2/1003:
[86220.125817]  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: 
[] process_one_work+0x1de/0x680
[86220.125895]  #1:  ((addr_chk_work).work){+.+.}, at: [] 
process_one_work+0x1de/0x680
[86220.125959]  #2:  (rtnl_mutex){+.+.}, at: [] 
rtnl_lock+0x12/0x20
[86220.126017]  #3:  (rcu_read_lock_bh){}, at: [] 
addrconf_verify_rtnl+0x1e/0x510 [ipv6]
[86220.126111]
   stack backtrace:
[86220.126142] CPU: 0 PID: 1003 Comm: kworker/0:2 Not tainted 
4.15.0-rc7-next-20180110+ #7
[86220.126185] Hardware name: ZOTAC ZBOX-CI321NANO/ZBOX-CI321NANO, BIOS 
B246P105 06/01/2015
[86220.126250] Workqueue: ipv6_addrconf addrconf_verify_work [ipv6]
[86220.126288] Call Trace:
[86220.126312]  dump_stack+0x70/0x9e
[86220.126337]  lockdep_rcu_suspicious+0xce/0xf0
[86220.126365]  ___might_sleep+0x1d3/0x240
[86220.126390]  __might_sleep+0x45/0x80
[86220.126416]  kmem_cache_alloc_trace+0x53/0x250
[86220.126458]  ? ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.126498]  ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.126538]  ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.126580]  ? ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.126623]  addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.126664]  ? addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.126708]  addrconf_verify_work+0xe/0x20 [ipv6]
[86220.126738]  process_one_work+0x258/0x680
[86220.126765]  worker_thread+0x35/0x3f0
[86220.126790]  kthread+0x124/0x140
[86220.126813]  ? process_one_work+0x680/0x680
[86220.126839]  ? kthread_create_worker_on_cpu+0x40/0x40
[86220.126869]  ? umh_complete+0x40/0x40
[86220.126893]  ? call_usermodehelper_exec_async+0x12a/0x160
[86220.126926]  ret_from_fork+0x4b/0x60
[86220.126999] BUG: sleeping function called from invalid context at 
mm/slab.h:420
[86220.127041] in_atomic(): 1, irqs_disabled(): 0, pid: 1003, name: kworker/0:2
[86220.127082] 4 locks held by kworker/0:2/1003:
[86220.127107]  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at: 
[] process_one_work+0x1de/0x680
[86220.127179]  #1:  ((addr_chk_work).work){+.+.}, at: [] 
process_one_work+0x1de/0x680
[86220.127242]  #2:  (rtnl_mutex){+.+.}, at: [] 
rtnl_lock+0x12/0x20
[86220.127300]  #3:  (rcu_read_lock_bh){}, at: [] 
addrconf_verify_rtnl+0x1e/0x510 [ipv6]
[86220.127414] CPU: 0 PID: 1003 Comm: kworker/0:2 Not tainted 
4.15.0-rc7-next-20180110+ #7
[86220.127463] Hardware name: ZOTAC ZBOX-CI321NANO/ZBOX-CI321NANO, BIOS 
B246P105 06/01/2015
[86220.127528] Workqueue: ipv6_addrconf addrconf_verify_work [ipv6]
[86220.127568] Call Trace:
[86220.127591]  dump_stack+0x70/0x9e
[86220.127616]  ___might_sleep+0x14d/0x240
[86220.127644]  __might_sleep+0x45/0x80
[86220.127672]  kmem_cache_alloc_trace+0x53/0x250
[86220.127717]  ? ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.127762]  ipv6_add_addr+0xfe/0x6e0 [ipv6]
[86220.127807]  ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.127854]  ? ipv6_create_tempaddr+0x24d/0x430 [ipv6]
[86220.127903]  addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.127950]  ? addrconf_verify_rtnl+0x339/0x510 [ipv6]
[86220.127998]  addrconf_verify_work+0xe/0x20 [ipv6]
[86220.128032]  process_one_work+0x258/0x680
[86220.128063]  worker_thread+0x35/0x3f0
[86220.128091]  kthread+0x124/0x140
[86220.128117]  ? process_one_work+0x680/0x680
[86220.128146]  ? kthread_create_worker_on_cpu+0x40/0x40
[86220.128180]  ? umh_complete+0x40/0x40
[86220.128207]  ? call_usermodehelper_exec_async+0x12a/0x160
[86220.128243]  ret_from_fork+0x4b/0x60

Fixes: f3d9832e56c4 ("ipv6: addrconf: cleanup locking in ipv6_add_addr")
Signed-off-by: Eric Dumazet 
Reported-by: Heiner Kallweit 
---
 net/ipv6/addrconf.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 
f49bd7897e95f15a381e4700660991f2d3c3fed4..10facd174210974ac82b2304211061b90714aac8
 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4352,9 +4352,11 @@ static void addrconf_verify_rtnl(void)
spin_lock(&ifp->lock);
 

Re: [RFC v2 net-next 00/10] Time based packet transmission

2018-01-26 Thread Jesus Sanchez-Palencia
Hi Levi,

On 01/23/2018 05:43 PM, Levi Pearson wrote:
> On Wed, Jan 17, 2018 at 4:06 PM, Jesus Sanchez-Palencia
>  wrote:
>> This series is the v2 of the Time based packet transmission RFC, which was
>> originally proposed by Richard Cochran: https://lwn.net/Articles/733962/ .
> 
> Great to see you carrying on with this!
> 
>> Our main questions at this stage are related to the qdisc:
>>  - does the proposed design attend all use cases?
>>  - should the qdisc really drop packets that expired after being queued even
>>for the SW best effort mode?
> 
> I don't think that being "expired" is necessarily cause for dropping.
> The semantic of a launch time is "launch no earlier than this point"
> after all, not a deadline. To keep the hardware working, we must only
> enforce the invariant that we never queue a packet with an earlier
> timestamp than one we previously enqueued that has not launched yet.
> Just checking for expiration is going to rule out some potential uses
> and also won't necessarily prevent enqueuing out-of-order packets.


Let me just split this a bit to make sure we don’t mix things up.

Currently, as discussed during the RFC v1 thread, on tbs_enqueue() we drop
packets if they are expired or if they have an earlier timestamp than the last
dequeued packet.

On tbs_dequeue(), we drop packets if they have expired while sitting at our
timerqueue. That is done because our current semantic for txtime is “no later
than this point”.  Are you suggesting that we change that to “no earlier than
this point” instead? The delta parameter would then be defining how early is
acceptable for dequeuing a packet, but we’ll need another parameter that can
define how late it should be when we decide to drop it.



> Here is an example:
> 
> A group of applications enqueue packets to be sent at 1 second
> intervals, and share a 5ms window in which they can send them. Due to
> scheduling variation, they may finish executing in a different order
> per interval, and occasionally some may not finish preparing their
> packet before the window opens, although they always will present
> their packet before the window closes.
> 
> If they all pick different times within the launch window, it is
> possible that two of them might pick times very close to one another.
> If they present their frames out-of-order to the qdisc, but close
> enough to the launch time that the qdisc doesn't hold on to them (i.e.
> in the [txtime - delta, txtime] range mentioned in tbs_dequeue), then
> they will get enqueued out of order and the invariant will be
> violated.  Reordering within some time window only works if all frames
> for that window are scheduled well in advance of the first launch
time, and that's not great for applications that need to calculate
> right up to the time they need to send their data.


I like the example, but due to the data structure that we use internally,
tbs_enqueue() will always enqueue packets onto their correct position, i.e. the
rbtree will always be ‘sorted’. If a dequeue() happens before the next enqueue,
then yes we may get to the situation you are describing, but that will always be
true regardless of the applications that are running, right? If that can’t be
fixed in userspace, then I’m afraid that either using a per-packet txtime is not
the right strategy for this system or tbs might not be the correct qdisc for it.

(...)


> 
> To maintain the hardware ordering invariant, you need to keep track of
> the most recent timestamp you have enqueued in the hardware. Anything
> that hits tbs_enqueue with a timestamp earlier than that must be
> either dropped or have its timestamp adjusted.


Yes, and we currently drop them there (that’s what the ktime_before(txtime, 
q->last)
check is doing). Adjusting timestamps is a can-of-worms, in my opinion, and I
don’t think we should go down that route.



> 
> The one remaining question is how late can a timestamped frame be
> before it should be dropped instead of enqueued, assuming it is to be
> allowed at all? The qdisc could track the allowed window based on user
> configuration. I believe the i210 hardware will launch any frame at
> the head of queue with a launch time set at or before the present
> time, but not so far before that it wraps and interprets the time as a
> future time. The qdisc would need to be able query the driver about
> how large that window is if it wants to pass in-the-past timestamps
> through as-is, but it could also just update timestamps still within
> the user-configured window to be set at the current time.


I believe I have tackled the question here already. For the rest, we don’t think
a qdisc should fetch any information from the driver. The information flow
should be kept as is, from qdisc to the driver, not the other way around.



> 
> My understanding of reservations for industrial TSN use cases is that
> applications will present their working period and their scheduling
> 

Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Samudrala, Sridhar


On 1/26/2018 2:47 PM, Jakub Kicinski wrote:

On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:

On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:

and the VM is not expected to do any tuning/optimizations on the VF driver
directly,
i think the current patch that follows the netvsc model of 2 netdevs(virtio
and vf) should
work fine.

OK. For your use case that's fine. But that's too specific scenario
with lots of restrictions IMHO, perhaps very few users will benefit
from it, I'm not sure. If you're unwilling to move towards it, we'd
take this one and come back with a generic solution that is able to
address general use cases for VF/PT live migration .

I think that's a fine approach. Scratch your own itch!  I imagine a very
generic virtio-switchdev providing host routing info to guests could
address lots of usecases. A driver could bind to that one and enslave
arbitrary other devices.  Sounds reasonable.

But given the fundamental idea of a failover was floated at least as
early as 2013, and made 0 progress since precisely because it kept
trying to address more and more features, and given netvsc is already
using the basic solution with some success, I'm not inclined to block
this specific effort waiting for the generic one.

I think there is an agreement that the extra netdev will be useful for
more advanced use cases, and is generally preferable.  What is the
argument for not doing that from the start?  If it was made I must have
missed it.  Is it just unwillingness to write the extra 300 lines of
code?  Sounds like a pretty weak argument when adding kernel ABI is at
stake...


I am still not clear on the need for the extra netdev created by 
virtio_net. The only advantage
i can see is that the stats can be broken between VF and virtio 
datapaths compared
to the aggregrated stats on virtio netdev as seen with the 2 netdev 
approach.


With 2 netdev model, any VM image that has a working network 
configuration will transparently get
VF based acceleration without any changes. 3 netdev model breaks this 
configuration starting with the
creation and naming of the 2 devices to udev needing to be aware of 
master and slave virtio-net devices.
Also, from a user experience point of view, loading a virtio-net with 
BACKUP feature

enabled will  now show 2 virtio-net netdevs.

For live migration with advanced usecases that Siwei is suggesting, i 
think we need a new driver
with a new device type that can track the VF specific feature settings 
even when the VF driver is unloaded.


Thanks
Sridhar




Re: [PATCH net-next v1] samples/bpf: Partially fixes the bpf.o build

2018-01-26 Thread Daniel Borkmann
On 01/26/2018 09:30 AM, Mickaël Salaün wrote:
> On 26/01/2018 03:16, Alexei Starovoitov wrote:
>> On Fri, Jan 26, 2018 at 01:39:30AM +0100, Mickaël Salaün wrote:
>>> Do not build lib/bpf/bpf.o with this Makefile but use the one from the
>>> library directory.  This avoid making a buggy bpf.o file (e.g. missing
>>> symbols).
>>
>> could you provide an example?
>> What symbols will be missing?
>> I don't think there is an issue with existing Makefile.
> 
> You can run this commands:
> make -C samples/bpf; nm tools/lib/bpf/bpf.o > a; make -C tools/lib/bpf;
> nm tools/lib/bpf/bpf.o > b; diff -u a b
> 
> Symbols like bzero and sys_bpf are missing with the samples/bpf
> Makefile, which makes the bpf.o shrink from 25K to 7K.

I've applied it to bpf-next, thanks Mickaël!


[PATCH bpf-next 1/2] bpf: fix kernel page fault in lpm map trie_get_next_key

2018-01-26 Thread Yonghong Song
Commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command
for LPM_TRIE map") introduces a bug likes below:

if (!rcu_dereference(trie->root))
return -ENOENT;
if (!key || key->prefixlen > trie->max_prefixlen) {
root = &trie->root;
goto find_leftmost;
}
..
  find_leftmost:
for (node = rcu_dereference(*root); node;) {

In the code after label find_leftmost, it is assumed
that *root should not be NULL, but it is not true as
it is possible trie->root is changed to NULL by an
asynchronous delete operation.

The issue is reported by syzbot and Eric Dumazet with the
below error log:
  ..
  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault:  [#1] SMP KASAN
  Dumping ftrace buffer:
 (ftrace buffer empty)
  Modules linked in:
  CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
Google 01/01/2011
  RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682
  ..

This patch fixed the issue by use local rcu_dereferenced
pointer instead of *(>root) later on.

Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map")
Reported-by: syzbot 
Reported-by: Eric Dumazet 
Signed-off-by: Yonghong Song 
---
 kernel/bpf/lpm_trie.c | 26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8f083ea..7b469d1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -593,11 +593,10 @@ static void trie_free(struct bpf_map *map)
 
 static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
+   struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
-   struct lpm_trie_node *node, *next_node = NULL, *parent;
struct lpm_trie_node **node_stack = NULL;
-   struct lpm_trie_node __rcu **root;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
size_t matchlen;
@@ -614,14 +613,13 @@ static int trie_get_next_key(struct bpf_map *map, void 
*_key, void *_next_key)
 */
 
/* Empty trie */
-   if (!rcu_dereference(trie->root))
+   search_root = rcu_dereference(trie->root);
+   if (!search_root)
return -ENOENT;
 
/* For invalid key, find the leftmost node in the trie */
-   if (!key || key->prefixlen > trie->max_prefixlen) {
-   root = &trie->root;
+   if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
-   }
 
node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node 
*),
 GFP_ATOMIC | __GFP_NOWARN);
@@ -629,7 +627,7 @@ static int trie_get_next_key(struct bpf_map *map, void 
*_key, void *_next_key)
return -ENOMEM;
 
/* Try to find the exact node for the given key */
-   for (node = rcu_dereference(trie->root); node;) {
+   for (node = search_root; node;) {
node_stack[++stack_ptr] = node;
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void 
*_key, void *_next_key)
node = rcu_dereference(node->child[next_bit]);
}
if (!node || node->prefixlen != key->prefixlen ||
-   (node->flags & LPM_TREE_NODE_FLAG_IM)) {
-   root = &trie->root;
+   (node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;
-   }
 
/* The node with the exactly-matching key has been found,
 * find the first node in postorder after the matched node.
@@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void 
*_key, void *_next_key)
node = node_stack[stack_ptr];
while (stack_ptr > 0) {
parent = node_stack[stack_ptr - 1];
-   if (rcu_dereference(parent->child[0]) == node &&
-   rcu_dereference(parent->child[1])) {
-   root = &parent->child[1];
-   goto find_leftmost;
+   if (rcu_dereference(parent->child[0]) == node) {
+   search_root = rcu_dereference(parent->child[1]);
+   if (search_root)
+   goto find_leftmost;
}
if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
next_node = parent;
@@ -673,7 +669,7 @@ static int trie_get_next_key(struct bpf_map *map, void 
*_key, void *_next_key)
/* Find the leftmost non-intermediate node, all intermediate nodes
 * have exact two children, so 

[PATCH bpf-next 2/2] tools/bpf: add a multithreaded stress test in bpf selftests test_lpm_map

2018-01-26 Thread Yonghong Song
The new test will spawn four threads, doing map update, delete, lookup
and get_next_key in parallel. It is able to reproduce the issue in the
previous commit found by syzbot and Eric Dumazet.

Signed-off-by: Yonghong Song 
---
 tools/testing/selftests/bpf/Makefile   |  2 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 95 ++
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 9868835..bf05bc5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -11,7 +11,7 @@ ifneq ($(wildcard $(GENHDR)),)
 endif
 
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
-I../../../include
-LDLIBS += -lcap -lelf -lrt
+LDLIBS += -lcap -lelf -lrt -lpthread
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs \
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c 
b/tools/testing/selftests/bpf/test_lpm_map.c
index 0815108..2be87e9 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -641,6 +642,98 @@ static void test_lpm_get_next_key(void)
close(map_fd);
 }
 
+#define MAX_TEST_KEYS  4
+struct lpm_mt_test_info {
+   int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */
+   int iter;
+   int map_fd;
+   struct {
+   __u32 prefixlen;
+   __u32 data;
+   } key[MAX_TEST_KEYS];
+};
+
+static void *lpm_test_command(void *arg)
+{
+   int i, j, ret, iter, key_size;
+   struct lpm_mt_test_info *info = arg;
+   struct bpf_lpm_trie_key *key_p;
+
+   key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32);
+   key_p = alloca(key_size);
+   for (iter = 0; iter < info->iter; iter++)
+   for (i = 0; i < MAX_TEST_KEYS; i++) {
+   /* first half of iterations in forward order,
+* and second half in backward order.
+*/
+   j = (iter < (info->iter / 2)) ? i : MAX_TEST_KEYS - i - 
1;
+   key_p->prefixlen = info->key[j].prefixlen;
+   memcpy(key_p->data, >key[j].data, sizeof(__u32));
+   if (info->cmd == 0) {
+   __u32 value = j;
+   /* update must succeed */
+   assert(bpf_map_update_elem(info->map_fd, key_p, 
, 0) == 0);
+   } else if (info->cmd == 1) {
+   ret = bpf_map_delete_elem(info->map_fd, key_p);
+   assert(ret == 0 || errno == ENOENT);
+   } else if (info->cmd == 2) {
+   __u32 value;
+   ret = bpf_map_lookup_elem(info->map_fd, key_p, 
);
+   assert(ret == 0 || errno == ENOENT);
+   } else {
+   struct bpf_lpm_trie_key *next_key_p = 
alloca(key_size);
+   ret = bpf_map_get_next_key(info->map_fd, key_p, 
next_key_p);
+   assert(ret == 0 || errno == ENOENT || errno == 
ENOMEM);
+   }
+   }
+
+   // Pass successful exit info back to the main thread
+   pthread_exit((void *)info);
+}
+
+static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd)
+{
+   info->iter = 2000;
+   info->map_fd = map_fd;
+   info->key[0].prefixlen = 16;
+   inet_pton(AF_INET, "192.168.0.0", >key[0].data);
+   info->key[1].prefixlen = 24;
+   inet_pton(AF_INET, "192.168.0.0", >key[1].data);
+   info->key[2].prefixlen = 24;
+   inet_pton(AF_INET, "192.168.128.0", >key[2].data);
+   info->key[3].prefixlen = 24;
+   inet_pton(AF_INET, "192.168.1.0", >key[3].data);
+}
+
+static void test_lpm_multi_thread(void)
+{
+   struct lpm_mt_test_info info[4];
+   size_t key_size, value_size;
+   pthread_t thread_id[4];
+   int i, map_fd;
+   void *ret;
+
+   /* create a trie */
+   value_size = sizeof(__u32);
+   key_size = sizeof(struct bpf_lpm_trie_key) + value_size;
+   map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size,
+   100, BPF_F_NO_PREALLOC);
+
+   /* create 4 threads to test update, delete, lookup and get_next_key */
+   setup_lpm_mt_test_info([0], map_fd);
+   for (i = 0; i < 4; i++) {
+   if (i != 0)
+   memcpy([i], [0], sizeof(info[i]));
+   info[i].cmd = i;
+   assert(pthread_create(_id[i], NULL, _test_command, 
[i]) == 0);
+   }
+
+   for (i = 0; i < 4; i++)
+ 

[PATCH bpf-next 0/2] bpf: fix kernel page fault in lpm map trie_get_next_key

2018-01-26 Thread Yonghong Song
A kernel page fault which happens in lpm map trie_get_next_key is reported
by syzbot and Eric. The issue was introduced by commit b471f2f1de8b
("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map").
Patch #1 fixed the issue in the kernel and patch #2 adds a multithreaded
test case in tools/testing/selftests/bpf/test_lpm_map.

Yonghong Song (2):
  bpf: fix kernel page fault in lpm map trie_get_next_key
  tools/bpf: add a multithreaded stress test in bpf selftests
test_lpm_map

 kernel/bpf/lpm_trie.c  | 26 
 tools/testing/selftests/bpf/Makefile   |  2 +-
 tools/testing/selftests/bpf/test_lpm_map.c | 95 ++
 3 files changed, 107 insertions(+), 16 deletions(-)

-- 
2.9.5



Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Jakub Kicinski
On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > > and the VM is not expected to do any tuning/optimizations on the VF driver
> > > directly,
> > > i think the current patch that follows the netvsc model of 2 
> > > netdevs(virtio
> > > and vf) should
> > > work fine.  
> > 
> > OK. For your use case that's fine. But that's too specific scenario
> > with lots of restrictions IMHO, perhaps very few users will benefit
> > from it, I'm not sure. If you're unwilling to move towards it, we'd
> > take this one and come back with a generic solution that is able to
> > address general use cases for VF/PT live migration .  
> 
> I think that's a fine approach. Scratch your own itch!  I imagine a very
> generic virtio-switchdev providing host routing info to guests could
> address lots of usecases. A driver could bind to that one and enslave
> arbitrary other devices.  Sounds reasonable.
> 
> But given the fundamental idea of a failover was floated at least as
> early as 2013, and made 0 progress since precisely because it kept
> trying to address more and more features, and given netvsc is already
> using the basic solution with some success, I'm not inclined to block
> this specific effort waiting for the generic one.

I think there is an agreement that the extra netdev will be useful for
more advanced use cases, and is generally preferable.  What is the
argument for not doing that from the start?  If it was made I must have
missed it.  Is it just unwillingness to write the extra 300 lines of
code?  Sounds like a pretty weak argument when adding kernel ABI is at
stake...


Re: [PATCH bpf-next] bpf: clean up from test_tcpbpf_kern.c

2018-01-26 Thread Daniel Borkmann
On 01/26/2018 09:06 PM, Lawrence Brakmo wrote:
> Removed commented lines from test_tcpbpf_kern.c
> 
> Fixes: d6d4f60c3a09 ("bpf: add selftest for tcpbpf")
> Signed-off-by: Lawrence Brakmo 

Applied to bpf-next, thanks Lawrence!


Re: [PATCH bpf-next 10/13] bpf, mips64: remove obsolete exception handling from div/mod

2018-01-26 Thread David Daney

On 01/26/2018 02:33 PM, Daniel Borkmann wrote:

Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from mips64 JIT.

Signed-off-by: Daniel Borkmann 
Cc: David Daney 


I didn't test it, but this looks correct, so ...

Reviewed-by: David Daney 


---
  arch/mips/net/ebpf_jit.c | 10 --
  1 file changed, 10 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 4e34703..296f1410 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -860,11 +860,6 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_DIV:
case BPF_MOD:
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
emit_instr(ctx, ddivu, dst, src);
if (bpf_op == BPF_DIV)
emit_instr(ctx, mflo, dst);
@@ -943,11 +938,6 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_DIV:
case BPF_MOD:
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
emit_instr(ctx, divu, dst, src);
if (bpf_op == BPF_DIV)
emit_instr(ctx, mflo, dst);





Re: [PATCH bpf-next 11/13] bpf, mips64: remove unneeded zero check from div/mod with k

2018-01-26 Thread David Daney

On 01/26/2018 02:33 PM, Daniel Borkmann wrote:

The verifier in both cBPF and eBPF reject div/mod by 0 imm,
so this can never load. Remove emitting such test and reject
it from being JITed instead (the latter is actually also not
needed, but given practice in sparc64, ppc64 today, so
doesn't hurt to add it here either).

Signed-off-by: Daniel Borkmann 
Cc: David Daney 


This looks plausible,

Reviewed-by: David Daney 



---
  arch/mips/net/ebpf_jit.c | 19 ---
  1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 296f1410..3e2798b 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -741,16 +741,11 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */
case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */
+   if (insn->imm == 0)
+   return -EINVAL;
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (dst < 0)
return dst;
-   if (insn->imm == 0) { /* Div by zero */
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, 
MIPS_R_ZERO);
-   }
td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
if (td == REG_64BIT || td == REG_32BIT_ZERO_EX)
/* sign extend */
@@ -770,19 +765,13 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */
case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */
+   if (insn->imm == 0)
+   return -EINVAL;
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (dst < 0)
return dst;
-   if (insn->imm == 0) { /* Div by zero */
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, 
MIPS_R_ZERO);
-   }
if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
-
if (insn->imm == 1) {
/* div by 1 is a nop, mod by 1 is zero */
if (bpf_op == BPF_MOD)





[PATCH bpf-next 00/13] BPF improvements and fixes

2018-01-26 Thread Daniel Borkmann
This set contains a small cleanup in cBPF prologue generation and
otherwise fixes an outstanding issue related to BPF to BPF calls
and exception handling. For details please see related patches.
Last but not least, BPF selftests is extended with several new
test cases.

Thanks!

Daniel Borkmann (13):
  bpf: xor of a/x in cbpf can be done in 32 bit alu
  bpf: improve dead code sanitizing
  bpf: make unknown opcode handling more robust
  bpf: fix subprog verifier bypass by div/mod by 0 exception
  bpf, x86_64: remove obsolete exception handling from div/mod
  bpf, arm64: remove obsolete exception handling from div/mod
  bpf, s390x: remove obsolete exception handling from div/mod
  bpf, ppc64: remove obsolete exception handling from div/mod
  bpf, sparc64: remove obsolete exception handling from div/mod
  bpf, mips64: remove obsolete exception handling from div/mod
  bpf, mips64: remove unneeded zero check from div/mod with k
  bpf, arm: remove obsolete exception handling from div/mod
  bpf: add further test cases around div/mod and others

 arch/arm/net/bpf_jit_32.c   |   8 -
 arch/arm64/net/bpf_jit_comp.c   |  13 --
 arch/mips/net/ebpf_jit.c|  29 +--
 arch/powerpc/net/bpf_jit_comp64.c   |   8 -
 arch/s390/net/bpf_jit_comp.c|  10 -
 arch/sparc/net/bpf_jit_comp_64.c|  18 --
 arch/x86/net/bpf_jit_comp.c |  20 --
 include/linux/filter.h  |   2 +
 kernel/bpf/core.c   | 258 -
 kernel/bpf/verifier.c   |  62 +++--
 lib/test_bpf.c  |   8 +-
 net/core/filter.c   |  13 +-
 tools/testing/selftests/bpf/test_verifier.c | 343 ++--
 13 files changed, 546 insertions(+), 246 deletions(-)

-- 
2.9.5



[PATCH bpf-next 05/13] bpf, x86_64: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from x86_64 JIT.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/x86/net/bpf_jit_comp.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5acee51..4923d92 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -568,26 +568,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, 
u8 *image,
 */
EMIT2(0x31, 0xd2);
 
-   if (BPF_SRC(insn->code) == BPF_X) {
-   /* if (src_reg == 0) return 0 */
-
-   /* cmp r11, 0 */
-   EMIT4(0x49, 0x83, 0xFB, 0x00);
-
-   /* jne .+9 (skip over pop, pop, xor and jmp) */
-   EMIT2(X86_JNE, 1 + 1 + 2 + 5);
-   EMIT1(0x5A); /* pop rdx */
-   EMIT1(0x58); /* pop rax */
-   EMIT2(0x31, 0xc0); /* xor eax, eax */
-
-   /* jmp cleanup_addr
-* addrs[i] - 11, because there are 11 bytes
-* after this insn: div, mov, pop, pop, mov
-*/
-   jmp_offset = ctx->cleanup_addr - (addrs[i] - 
11);
-   EMIT1_off32(0xE9, jmp_offset);
-   }
-
if (BPF_CLASS(insn->code) == BPF_ALU64)
/* div r11 */
EMIT3(0x49, 0xF7, 0xF3);
-- 
2.9.5



[PATCH bpf-next 13/13] bpf: add further test cases around div/mod and others

2018-01-26 Thread Daniel Borkmann
Update selftests to reflect recent changes and add various new
test cases.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 lib/test_bpf.c  |   8 +-
 tools/testing/selftests/bpf/test_verifier.c | 343 ++--
 2 files changed, 336 insertions(+), 15 deletions(-)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index e3938e3..4cd9ea9 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -2003,10 +2003,14 @@ static struct bpf_test tests[] = {
{ { 4, 0 }, { 5, 10 } }
},
{
-   "INT: DIV by zero",
+   /* This one doesn't go through verifier, but is just raw insn
+* as opposed to cBPF tests from here. Thus div by 0 tests are
+* done in test_verifier in BPF kselftests.
+*/
+   "INT: DIV by -1",
.u.insns_int = {
BPF_ALU64_REG(BPF_MOV, R6, R1),
-   BPF_ALU64_IMM(BPF_MOV, R7, 0),
+   BPF_ALU64_IMM(BPF_MOV, R7, -1),
BPF_LD_ABS(BPF_B, 3),
BPF_ALU32_REG(BPF_DIV, R0, R7),
BPF_EXIT_INSN(),
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 9e7075b..697bd83 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -111,7 +112,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
-   .retval = 0,
+   .retval = 42,
},
{
"DIV32 by 0, zero check 2",
@@ -123,7 +124,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
-   .retval = 0,
+   .retval = 42,
},
{
"DIV64 by 0, zero check",
@@ -135,7 +136,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
-   .retval = 0,
+   .retval = 42,
},
{
"MOD32 by 0, zero check 1",
@@ -147,7 +148,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
-   .retval = 0,
+   .retval = 42,
},
{
"MOD32 by 0, zero check 2",
@@ -159,7 +160,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
-   .retval = 0,
+   .retval = 42,
},
{
"MOD64 by 0, zero check",
@@ -171,13 +172,245 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.result = ACCEPT,
+   .retval = 42,
+   },
+   {
+   "DIV32 by 0, zero check ok, cls",
+   .insns = {
+   BPF_MOV32_IMM(BPF_REG_0, 42),
+   BPF_MOV32_IMM(BPF_REG_1, 2),
+   BPF_MOV32_IMM(BPF_REG_2, 16),
+   BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+   BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+   .result = ACCEPT,
+   .retval = 8,
+   },
+   {
+   "DIV32 by 0, zero check 1, cls",
+   .insns = {
+   BPF_MOV32_IMM(BPF_REG_1, 0),
+   BPF_MOV32_IMM(BPF_REG_0, 1),
+   BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "DIV32 by 0, zero check 2, cls",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_1, 0xLL),
+   BPF_MOV32_IMM(BPF_REG_0, 1),
+   BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "DIV64 by 0, zero check, cls",
+   .insns = {
+   BPF_MOV32_IMM(BPF_REG_1, 0),
+   BPF_MOV32_IMM(BPF_REG_0, 1),
+   BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+   BPF_EXIT_INSN(),
+   },
+   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ 

[PATCH bpf-next 06/13] bpf, arm64: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from arm64 JIT.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/arm64/net/bpf_jit_comp.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 0775d5a..1d4f1da 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -390,18 +390,6 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
case BPF_ALU64 | BPF_DIV | BPF_X:
case BPF_ALU | BPF_MOD | BPF_X:
case BPF_ALU64 | BPF_MOD | BPF_X:
-   {
-   const u8 r0 = bpf2a64[BPF_REG_0];
-
-   /* if (src == 0) return 0 */
-   jmp_offset = 3; /* skip ahead to else path */
-   check_imm19(jmp_offset);
-   emit(A64_CBNZ(is64, src, jmp_offset), ctx);
-   emit(A64_MOVZ(1, r0, 0, 0), ctx);
-   jmp_offset = epilogue_offset(ctx);
-   check_imm26(jmp_offset);
-   emit(A64_B(jmp_offset), ctx);
-   /* else */
switch (BPF_OP(code)) {
case BPF_DIV:
emit(A64_UDIV(is64, dst, dst, src), ctx);
@@ -413,7 +401,6 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
break;
}
break;
-   }
case BPF_ALU | BPF_LSH | BPF_X:
case BPF_ALU64 | BPF_LSH | BPF_X:
emit(A64_LSLV(is64, dst, dst, src), ctx);
-- 
2.9.5



[PATCH bpf-next 04/13] bpf: fix subprog verifier bypass by div/mod by 0 exception

2018-01-26 Thread Daniel Borkmann
One of the ugly leftovers from the early eBPF days is that div/mod
operations based on registers have a hard-coded src_reg == 0 test
in the interpreter as well as in JIT code generators that would
return from the BPF program with exit code 0. This was basically
adopted from cBPF interpreter for historical reasons.

There are multiple reasons why this is very suboptimal and prone
to bugs. To name one: the return code mapping for such abnormal
program exit of 0 does not always match with a suitable program
type's exit code mapping. For example, '0' in tc means action 'ok'
where the packet gets passed further up the stack, which is just
undesirable for such cases (e.g. when implementing policy) and
also does not match with other program types.

While trying to work out an exception handling scheme, I also
noticed that programs crafted like the following will currently
pass the verifier:

  0: (bf) r6 = r1
  1: (85) call pc+8
  caller:
   R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  callee:
   frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1
  10: (b4) (u32) r2 = (u32) 0
  11: (b4) (u32) r3 = (u32) 1
  12: (3c) (u32) r3 /= (u32) r2
  13: (61) r0 = *(u32 *)(r1 +76)
  14: (95) exit
  returning from callee:
   frame1: R0_w=pkt(id=0,off=0,r=0,imm=0)
   R1=ctx(id=0,off=0,imm=0) R2_w=inv0
   R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
   R10=fp0,call_1
  to caller at 2:
   R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1

  from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0)
R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  2: (bf) r1 = r6
  3: (61) r1 = *(u32 *)(r1 +80)
  4: (bf) r2 = r0
  5: (07) r2 += 8
  6: (2d) if r2 > r1 goto pc+1
   R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0)
   R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0)
   R10=fp0,call_-1
  7: (71) r0 = *(u8 *)(r0 +0)
  8: (b7) r0 = 1
  9: (95) exit

  from 6 to 8: safe
  processed 16 insns (limit 131072), stack depth 0+0

Basically what happens is that in the subprog we make use of a
div/mod by 0 exception and in the 'normal' subprog's exit path
we just return skb->data back to the main prog. This has the
implication that the verifier thinks we always get a pkt pointer
in R0 while we still have the implicit 'return 0' from the div
as an alternative unconditional return path earlier. Thus, R0
then contains 0, meaning back in the parent prog we get the
address range of [0x0, skb->data_end] as read and writeable.
Similar can be crafted with other pointer register types.

Since i) BPF_ABS/IND is not allowed in programs that contain
BPF to BPF calls (and generally it's also disadvised to use in
native eBPF context), ii) unknown opcodes don't return zero
anymore, iii) we don't return an exception code in dead branches,
the only last missing case affected and to fix is the div/mod
handling.

What we would really need is some infrastructure to propagate
exceptions all the way to the original prog unwinding the
current stack and returning that code to the caller of the
BPF program. In user space such exception handling for similar
runtimes is typically implemented with setjmp(3) and longjmp(3)
as one possibility which is not available in the kernel,
though (kgdb used to implement it in kernel long time ago). I
implemented a PoC exception handling mechanism into the BPF
interpreter with porting setjmp()/longjmp() into x86_64 and
adding a new internal BPF_ABRT opcode that can use a program
specific exception code for all exception cases we have (e.g.
div/mod by 0, unknown opcodes, etc). While this seems to work
in the constrained BPF environment (meaning, here, we don't
need to deal with state e.g. from memory allocations that we
would need to undo before going into exception state), it still
has various drawbacks: i) we would need to implement the
setjmp()/longjmp() for every arch supported in the kernel and
for x86_64, arm64, sparc64 JITs currently supporting calls,
ii) it has unconditional additional cost on main program
entry to store CPU register state in initial setjmp() call,
and we would need some way to pass the jmp_buf down into
___bpf_prog_run() for main prog and all subprogs, but also
storing on stack is not really nice (other option would be
per-cpu storage for this, but it also has the drawback that
we need to disable preemption for every BPF program types).
All in all this approach would add a lot of complexity.

Another poor-man's solution would be to have some sort of
additional shared register or scratch buffer to hold state
for exceptions, and test that after every call return to
chain returns and pass R0 all the way down to BPF prog caller.
This is also problematic in various ways: i) an additional
register doesn't map well into JITs, and some other scratch
space could only be on per-cpu storage, which, again has the
side-effect that this only works when we disable preemption,
or somewhere in the input context which is not available
everywhere either, and 

[PATCH bpf-next 01/13] bpf: xor of a/x in cbpf can be done in 32 bit alu

2018-01-26 Thread Daniel Borkmann
Very minor optimization; saves 1 byte per program in x86_64
JIT in cBPF prologue.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 18da42a..cba2f73 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -401,8 +401,8 @@ static int bpf_convert_filter(struct sock_filter *prog, int 
len,
/* Classic BPF expects A and X to be reset first. These need
 * to be guaranteed to be the first two instructions.
 */
-   *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
-   *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+   *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+   *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 
/* All programs must keep CTX in callee saved BPF_REG_CTX.
 * In eBPF case it's done by the compiler, here we need to
-- 
2.9.5



[PATCH bpf-next 03/13] bpf: make unknown opcode handling more robust

2018-01-26 Thread Daniel Borkmann
Recent findings by syzcaller fixed in 7891a87efc71 ("bpf: arsh is
not supported in 32 bit alu thus reject it") triggered a warning
in the interpreter due to unknown opcode not being rejected by
the verifier. The 'return 0' for an unknown opcode is really not
optimal, since with BPF to BPF calls, this would go untracked by
the verifier.

Do two things here to improve the situation: i) perform basic insn
sanity check early on in the verification phase and reject every
non-uapi insn right there. The bpf_opcode_in_insntable() table
reuses the same mapping as the jumptable in ___bpf_prog_run() sans
the non-public mappings. And ii) in ___bpf_prog_run() we do need
to BUG in the case where the verifier would ever create an unknown
opcode due to some rewrites.

Note that JITs do not have such issues since they would punt to
interpreter in these situations. Moreover, the BPF_JIT_ALWAYS_ON
would also help to avoid such unknown opcodes in the first place.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/filter.h |   2 +
 kernel/bpf/core.c  | 250 -
 kernel/bpf/verifier.c  |   7 ++
 3 files changed, 154 insertions(+), 105 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 425056c..7bd06b4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -688,6 +688,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff 
*skb)
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
+bool bpf_opcode_in_insntable(u8 code);
+
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
  gfp_t gfp_extra_flags);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3aa0658..01962c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 
r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3)   \
+   /* 32 bit ALU operations. */\
+   /*   Register based. */ \
+   INSN_3(ALU, ADD, X),\
+   INSN_3(ALU, SUB, X),\
+   INSN_3(ALU, AND, X),\
+   INSN_3(ALU, OR,  X),\
+   INSN_3(ALU, LSH, X),\
+   INSN_3(ALU, RSH, X),\
+   INSN_3(ALU, XOR, X),\
+   INSN_3(ALU, MUL, X),\
+   INSN_3(ALU, MOV, X),\
+   INSN_3(ALU, DIV, X),\
+   INSN_3(ALU, MOD, X),\
+   INSN_2(ALU, NEG),   \
+   INSN_3(ALU, END, TO_BE),\
+   INSN_3(ALU, END, TO_LE),\
+   /*   Immediate based. */\
+   INSN_3(ALU, ADD, K),\
+   INSN_3(ALU, SUB, K),\
+   INSN_3(ALU, AND, K),\
+   INSN_3(ALU, OR,  K),\
+   INSN_3(ALU, LSH, K),\
+   INSN_3(ALU, RSH, K),\
+   INSN_3(ALU, XOR, K),\
+   INSN_3(ALU, MUL, K),\
+   INSN_3(ALU, MOV, K),\
+   INSN_3(ALU, DIV, K),\
+   INSN_3(ALU, MOD, K),\
+   /* 64 bit ALU operations. */\
+   /*   Register based. */ \
+   INSN_3(ALU64, ADD,  X), \
+   INSN_3(ALU64, SUB,  X), \
+   INSN_3(ALU64, AND,  X), \
+   INSN_3(ALU64, OR,   X), \
+   INSN_3(ALU64, LSH,  X), \
+   INSN_3(ALU64, RSH,  X), \
+   INSN_3(ALU64, XOR,  X), \
+   INSN_3(ALU64, MUL,  X), \
+   INSN_3(ALU64, MOV,  X), \
+   INSN_3(ALU64, ARSH, X), \
+   INSN_3(ALU64, DIV,  X), \
+   INSN_3(ALU64, MOD,  X), \
+   INSN_2(ALU64, NEG), \
+   /*   Immediate based. */\
+   INSN_3(ALU64, ADD,  K), \
+   INSN_3(ALU64, SUB,  K), \
+   INSN_3(ALU64, AND,  K), \
+   INSN_3(ALU64, OR,   K), \
+   INSN_3(ALU64, LSH,  K), \
+   INSN_3(ALU64, RSH,  K), \
+   INSN_3(ALU64, XOR,  K), \
+   INSN_3(ALU64, MUL,  K), \
+   INSN_3(ALU64, MOV,  K), \
+   INSN_3(ALU64, ARSH, K), \
+   INSN_3(ALU64, DIV,  K), \
+   INSN_3(ALU64, MOD,  

[PATCH bpf-next 10/13] bpf, mips64: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from mips64 JIT.

Signed-off-by: Daniel Borkmann 
Cc: David Daney 
---
 arch/mips/net/ebpf_jit.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 4e34703..296f1410 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -860,11 +860,6 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_DIV:
case BPF_MOD:
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
emit_instr(ctx, ddivu, dst, src);
if (bpf_op == BPF_DIV)
emit_instr(ctx, mflo, dst);
@@ -943,11 +938,6 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_DIV:
case BPF_MOD:
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
emit_instr(ctx, divu, dst, src);
if (bpf_op == BPF_DIV)
emit_instr(ctx, mflo, dst);
-- 
2.9.5



[PATCH bpf-next 09/13] bpf, sparc64: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from sparc64 JIT.

Signed-off-by: Daniel Borkmann 
Cc: David S. Miller 
---
 arch/sparc/net/bpf_jit_comp_64.c | 18 --
 1 file changed, 18 deletions(-)

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 50a24d7..48a2586 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -967,31 +967,17 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
emit_alu(MULX, src, dst, ctx);
break;
case BPF_ALU | BPF_DIV | BPF_X:
-   emit_cmp(src, G0, ctx);
-   emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-   emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
emit_write_y(G0, ctx);
emit_alu(DIV, src, dst, ctx);
break;
-
case BPF_ALU64 | BPF_DIV | BPF_X:
-   emit_cmp(src, G0, ctx);
-   emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-   emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
emit_alu(UDIVX, src, dst, ctx);
break;
-
case BPF_ALU | BPF_MOD | BPF_X: {
const u8 tmp = bpf2sparc[TMP_REG_1];
 
ctx->tmp_1_used = true;
 
-   emit_cmp(src, G0, ctx);
-   emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-   emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
emit_write_y(G0, ctx);
emit_alu3(DIV, dst, src, tmp, ctx);
emit_alu3(MULX, tmp, src, tmp, ctx);
@@ -1003,10 +989,6 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
 
ctx->tmp_1_used = true;
 
-   emit_cmp(src, G0, ctx);
-   emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-   emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
emit_alu3(UDIVX, dst, src, tmp, ctx);
emit_alu3(MULX, tmp, src, tmp, ctx);
emit_alu3(SUB, dst, tmp, dst, ctx);
-- 
2.9.5



[PATCH bpf-next 07/13] bpf, s390x: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from s390x JIT.

Signed-off-by: Daniel Borkmann 
Cc: Michael Holzheu 
---
 arch/s390/net/bpf_jit_comp.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index e501887..78a19c9 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -610,11 +610,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
-   jit->seen |= SEEN_RET0;
-   /* ltr %src,%src (if src == 0 goto fail) */
-   EMIT2(0x1200, src_reg, src_reg);
-   /* jz <ret0> */
-   EMIT4_PCREL(0xa784, jit->ret0_ip - jit->prg);
/* lhi %w0,0 */
EMIT4_IMM(0xa708, REG_W0, 0);
/* lr %w1,%dst */
@@ -630,11 +625,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
-   jit->seen |= SEEN_RET0;
-   /* ltgr %src,%src (if src == 0 goto fail) */
-   EMIT4(0xb902, src_reg, src_reg);
-   /* jz <ret0> */
-   EMIT4_PCREL(0xa784, jit->ret0_ip - jit->prg);
/* lghi %w0,0 */
EMIT4_IMM(0xa709, REG_W0, 0);
/* lgr %w1,%dst */
-- 
2.9.5



[PATCH bpf-next 12/13] bpf, arm: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from arm32 JIT.

Signed-off-by: Daniel Borkmann 
Cc: Shubham Bansal 
---
 arch/arm/net/bpf_jit_32.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 41e2feb..b5030e1 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -363,15 +363,7 @@ static inline int epilogue_offset(const struct jit_ctx 
*ctx)
 static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 
op)
 {
const u8 *tmp = bpf2a32[TMP_REG_1];
-   s32 jmp_offset;
 
-   /* checks if divisor is zero or not. If it is, then
-* exit directly.
-*/
-   emit(ARM_CMP_I(rn, 0), ctx);
-   _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx);
-   jmp_offset = epilogue_offset(ctx);
-   _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
 #if __LINUX_ARM_ARCH__ == 7
if (elf_hwcap & HWCAP_IDIVA) {
if (op == BPF_DIV)
-- 
2.9.5



[PATCH bpf-next 08/13] bpf, ppc64: remove obsolete exception handling from div/mod

2018-01-26 Thread Daniel Borkmann
Since we've changed div/mod exception handling for src_reg in
eBPF verifier itself, remove the leftovers from ppc64 JIT.

Signed-off-by: Daniel Borkmann 
Cc: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit_comp64.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 217a78e..0a34b0c 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -381,10 +381,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
goto bpf_alu32_trunc;
case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */
case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */
-   PPC_CMPWI(src_reg, 0);
-   PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12);
-   PPC_LI(b2p[BPF_REG_0], 0);
-   PPC_JMP(exit_addr);
if (BPF_OP(code) == BPF_MOD) {
PPC_DIVWU(b2p[TMP_REG_1], dst_reg, src_reg);
PPC_MULW(b2p[TMP_REG_1], src_reg,
@@ -395,10 +391,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
goto bpf_alu32_trunc;
case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */
case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */
-   PPC_CMPDI(src_reg, 0);
-   PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12);
-   PPC_LI(b2p[BPF_REG_0], 0);
-   PPC_JMP(exit_addr);
if (BPF_OP(code) == BPF_MOD) {
PPC_DIVD(b2p[TMP_REG_1], dst_reg, src_reg);
PPC_MULD(b2p[TMP_REG_1], src_reg,
-- 
2.9.5



[PATCH bpf-next 11/13] bpf, mips64: remove unneeded zero check from div/mod with k

2018-01-26 Thread Daniel Borkmann
The verifier in both cBPF and eBPF reject div/mod by 0 imm,
so this can never load. Remove emitting such test and reject
it from being JITed instead (the latter is actually also not
needed, but given practice in sparc64, ppc64 today, so
doesn't hurt to add it here either).

Signed-off-by: Daniel Borkmann 
Cc: David Daney 
---
 arch/mips/net/ebpf_jit.c | 19 ---
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 296f1410..3e2798b 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -741,16 +741,11 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */
case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */
+   if (insn->imm == 0)
+   return -EINVAL;
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (dst < 0)
return dst;
-   if (insn->imm == 0) { /* Div by zero */
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, 
MIPS_R_ZERO);
-   }
td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
if (td == REG_64BIT || td == REG_32BIT_ZERO_EX)
/* sign extend */
@@ -770,19 +765,13 @@ static int build_one_insn(const struct bpf_insn *insn, 
struct jit_ctx *ctx,
break;
case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */
case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */
+   if (insn->imm == 0)
+   return -EINVAL;
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (dst < 0)
return dst;
-   if (insn->imm == 0) { /* Div by zero */
-   b_off = b_imm(exit_idx, ctx);
-   if (is_bad_offset(b_off))
-   return -E2BIG;
-   emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-   emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, 
MIPS_R_ZERO);
-   }
if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
-
if (insn->imm == 1) {
/* div by 1 is a nop, mod by 1 is zero */
if (bpf_op == BPF_MOD)
-- 
2.9.5



[PATCH bpf-next 02/13] bpf: improve dead code sanitizing

2018-01-26 Thread Daniel Borkmann
Given we recently had c131187db2d3 ("bpf: fix branch pruning
logic") and 95a762e2c8c9 ("bpf: fix incorrect sign extension in
check_alu_op()") in particular where before verifier skipped
verification of the wrongly assumed dead branch, we should not
just replace the dead code parts with nops (mov r0,r0). If there
is a bug such as fixed in 95a762e2c8c9 in future again, where
runtime could execute those insns, then one of the potential
issues with the current setting would be that given the nops
would be at the end of the program, we could execute out of
bounds at some point.

The best in such case would be to just exit the BPF program
altogether and return an exception code. However, given this
would require two instructions, and such a dead code gap could
just be a single insn long, we would need to place 'r0 = X; ret'
snippet at the very end after the user program or at the start
before the program (where we'd skip that region on prog entry),
and then place unconditional ja's into the dead code gap.

While more complex but possible, there's still another block
in the road that currently prevents from this, namely BPF to
BPF calls. The issue here is that such exception could be
returned from a callee, but the caller would not know that
it's an exception that needs to be propagated further down.
Alternative that has little complexity is to just use a ja-1
code for now which will trap the execution here instead of
silently doing bad things if we ever get there due to bugs.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dfb138b..8365259 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5064,14 +5064,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct 
bpf_verifier_env *env, u32 of
return new_prog;
 }
 
-/* The verifier does more data flow analysis than llvm and will not explore
- * branches that are dead at run time. Malicious programs can have dead code
- * too. Therefore replace all dead at-run-time code with nops.
+/* The verifier does more data flow analysis than llvm and will not
+ * explore branches that are dead at run time. Malicious programs can
+ * have dead code too. Therefore replace all dead at-run-time code
+ * with 'ja -1'.
+ *
+ * Just nops are not optimal, e.g. if they would sit at the end of the
+ * program and through another bug we would manage to jump there, then
+ * we'd execute beyond program memory otherwise. Returning exception
+ * code also wouldn't work since we can have subprogs where the dead
+ * code could be located.
  */
 static void sanitize_dead_code(struct bpf_verifier_env *env)
 {
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
-   struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+   struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
struct bpf_insn *insn = env->prog->insnsi;
const int insn_cnt = env->prog->len;
int i;
@@ -5079,7 +5086,7 @@ static void sanitize_dead_code(struct bpf_verifier_env 
*env)
for (i = 0; i < insn_cnt; i++) {
if (aux_data[i].seen)
continue;
-   memcpy(insn + i, &nop, sizeof(nop));
+   memcpy(insn + i, &trap, sizeof(trap));
}
 }
 
-- 
2.9.5



Re: ipv6_addrconf: WARNING about suspicious RCU usage

2018-01-26 Thread Heiner Kallweit
Am 23.01.2018 um 19:01 schrieb Ido Schimmel:
> On Sun, Jan 21, 2018 at 10:22:16PM +0100, Heiner Kallweit wrote:
>> So far everything looks good with Eric's patch. The warning didn't show up 
>> again.
> 
> Eric, can you please submit your patch?
> .
> 
Soon the merge window starts, can the patch be submitted before?


Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Michael S. Tsirkin
On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > and the VM is not expected to do any tuning/optimizations on the VF driver
> > directly,
> > i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > and vf) should
> > work fine.
> 
> OK. For your use case that's fine. But that's too specific scenario
> with lots of restrictions IMHO, perhaps very few users will benefit
> from it, I'm not sure. If you're unwilling to move towards it, we'd
> take this one and come back with a generic solution that is able to
> address general use cases for VF/PT live migration.

I think that's a fine approach. Scratch your own itch!  I imagine a very
generic virtio-switchdev providing host routing info to guests could
address lots of usecases. A driver could bind to that one and enslave
arbitrary other devices.  Sounds reasonable.

But given the fundamental idea of a failover was floated at least as
early as 2013, and made 0 progress since precisely because it kept
trying to address more and more features, and given netvsc is already
using the basic solution with some success, I'm not inclined to block
this specific effort waiting for the generic one.

-- 
MST


Re: [PATCH] vsock.7: document VSOCK socket address family

2018-01-26 Thread Michael Kerrisk (man-pages)
Stefan,

I've just now noted that your page came with no license. What license
do you want to use? Please see
https://www.kernel.org/doc/man-pages/licenses.html

Thanks,

Michael


On 30 November 2017 at 12:21, Stefan Hajnoczi  wrote:
> The AF_VSOCK address family has been available since Linux 3.9 without a
> corresponding man page.
>
> This patch adds vsock.7 and describes its use along the same lines as
> existing ip.7, unix.7, and netlink.7 man pages.
>
> CC: Jorgen Hansen 
> CC: Dexuan Cui 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  man7/vsock.7 | 175 
> +++
>  1 file changed, 175 insertions(+)
>  create mode 100644 man7/vsock.7
>
> diff --git a/man7/vsock.7 b/man7/vsock.7
> new file mode 100644
> index 0..48c6c2e1e
> --- /dev/null
> +++ b/man7/vsock.7
> @@ -0,0 +1,175 @@
> +.TH VSOCK 7 2017-11-30 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +vsock \- Linux VSOCK address family
> +.SH SYNOPSIS
> +.B #include 
> +.br
> +.B #include 
> +.PP
> +.IB stream_socket " = socket(AF_VSOCK, SOCK_STREAM, 0);"
> +.br
> +.IB datagram_socket " = socket(AF_VSOCK, SOCK_DGRAM, 0);"
> +.SH DESCRIPTION
> +The VSOCK address family facilitates communication between virtual machines 
> and
> +the host they are running on.  This address family is used by guest agents 
> and
> +hypervisor services that need a communications channel that is independent of
> +virtual machine network configuration.
> +.PP
> +Valid socket types are
> +.B SOCK_STREAM
> +and
> +.B SOCK_DGRAM .
> +.B SOCK_STREAM
> +provides connection-oriented byte streams with guaranteed, in-order delivery.
> +.B SOCK_DGRAM
> +provides a connectionless datagram packet service.  Availability of these
> +socket types is dependent on the underlying hypervisor.
> +.PP
> +A new socket is created with
> +.PP
> +socket(AF_VSOCK, socket_type, 0);
> +.PP
> +When a process wants to establish a connection it calls
> +.BR connect (2)
> +with a given destination socket address.  The socket is automatically bound 
> to
> +a free port if unbound.
> +.PP
> +A process can listen for incoming connections by first binding to a socket 
> address using
> +.BR bind (2)
> +and then calling
> +.BR listen (2).
> +.PP
> +Data is transferred using the usual
> +.BR send (2)
> +and
> +.BR recv (2)
> +family of socket system calls.
> +.SS Address format
> +A socket address is defined as a combination of a 32-bit Context Identifier 
> (CID) and a 32-bit port number.  The CID identifies the source or 
> destination, which is either a virtual machine or the host.  The port number 
> differentiates between multiple services running on a single machine.
> +.PP
> +.in +4n
> +.EX
> +struct sockaddr_vm {
> +sa_family_t svm_family; /* address family: AF_VSOCK */
> +unsigned short  svm_reserved1;
> +unsigned intsvm_port;   /* port in native byte order */
> +unsigned intsvm_cid;/* address in native byte order */
> +};
> +.EE
> +.in
> +.PP
> +.I svm_family
> +is always set to
> +.BR AF_VSOCK .
> +.I svm_reserved1
> +is always set to 0.
> +.I svm_port
> +contains the port in native byte order.
> +The port numbers below 1024 are called
> +.IR "privileged ports" .
> +Only a process with
> +.B CAP_NET_BIND_SERVICE
> +capability may
> +.BR bind (2)
> +to these port numbers.
> +.PP
> +There are several special addresses:
> +.B VMADDR_CID_ANY
> +(-1U)
> +means any address for binding;
> +.B VMADDR_CID_HYPERVISOR
> +(0) and
> +.B VMADDR_CID_RESERVED
> +(1) are unused addresses;
> +.B VMADDR_CID_HOST
> +(2)
> +is the well-known address of the host.
> +.PP
> +The special constant
> +.B VMADDR_PORT_ANY
> +(-1U)
> +means any port number for binding.
> +.SS Live migration
> +Sockets are affected by live migration of virtual machines.  Connected
> +.B SOCK_STREAM
> +sockets become disconnected when the virtual machine migrates to a new host.
> +Applications must reconnect when this happens.
> +.PP
> +The local CID may change across live migration if the old CID is not 
> available
> +on the new host.  Bound sockets are automatically updated to the new CID.
> +.SS Ioctls
> +.TP
> +.B IOCTL_VM_SOCKETS_GET_LOCAL_CID
> +Get the CID of the local machine.  The argument is a pointer to an unsigned 
> int.
> +.IP
> +.in +4n
> +.EX
> +.IB error " = ioctl(" socket ", " IOCTL_VM_SOCKETS_GET_LOCAL_CID ", " &cid 
> ");"
> +.EE
> +.in
> +.IP
> +Consider using
> +.B VMADDR_CID_ANY
> +when binding instead of getting the local CID with
> +.B IOCTL_VM_SOCKETS_GET_LOCAL_CID .
> +.SH ERRORS
> +.TP
> +.B EACCES
> +Unable to bind to a privileged port without the
> +.B CAP_NET_BIND_SERVICE
> +capability.
> +.TP
> +.B EINVAL
> +Invalid parameters.  This includes:
> +attempting to bind a socket that is already bound, providing an invalid 
> struct
> +.B sockaddr_vm ,
> +and other input validation errors.
> +.TP
> +.B EOPNOTSUPP
> +Operation not 

Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-01-26 Thread Siwei Liu
On Fri, Jan 26, 2018 at 8:51 AM, Samudrala, Sridhar
 wrote:
>
>
> On 1/26/2018 12:14 AM, Siwei Liu wrote:
>>
>> On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin 
>> wrote:
>>>
>>> On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:

 On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin 
 wrote:
>
> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>>
>> First off, as mentioned in another thread, the model of stacking up
>> virt-bond functionality over virtio seems a wrong direction to me.
>> Essentially the migration process would need to carry over all guest
>> side configurations previously done on the VF/PT and get them moved to
>> the new device being it virtio or VF/PT.
>
> I might be wrong but I don't see why we should worry about this
> usecase.
> Whoever has a bond configured already has working config for migration.
> We are trying to help people who don't, not convert existig users.

 That has been placed in the view of cloud providers that the imported
 images from the store must be able to run unmodified thus no
 additional setup script is allowed (just as Stephen mentioned in
 another mail). Cloud users don't care about live migration themselves
 but the providers are required to implement such automation mechanism
 to make this process transparent if at all possible. The user does not
 care about the device underneath being VF or not, but they do care
 about consistency all across and the resulting performance
 acceleration in making VF the prefered datapath. It is not quite
 peculiar user cases but IMHO *any* approach proposed for live
 migration should be able to persist the state including network config
 e.g. as simple as MTU. Actually this requirement has nothing to do
 with virtio but our target users are live migration agnostic, being it
 tracking DMA through dirty pages, using virtio as the helper, or
 whatsoever, the goal of persisting configs across remains same.
>>>
>>> So the patching being discussed here will mostly do exactly that if your
>>> original config was simply a single virtio net device.
>>>
>> True, but I don't see the patch being discussed starts with good
>> foundation of supporting the same for VF/PT device. That is the core
>> of the issue.
>
>
>>> What kind of configs do your users have right now?
>>
>> Any configs be it generic or driver specific that the VF/PT device
>> supports and have been enabled/configured. General network configs
>> (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings
 (hardware offload, # of queues and ring entries, RSC options, rss
>> rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc
>> flower offload, just to name a few. As cloud providers we don't limit
>> users from applying driver specific tuning to the NIC/VF, and
>> sometimes this is essential to achieving best performance for their
>> workload. We've seen cases like tuning coalescing parameters for
>> getting low latency, changing rx-flow-hash function for better VXLAN
>> throughput, or even adopting quite advanced NIC features such as flow
>> director or cloud filter. We don't expect users to compromise even a
>> little bit on these. That is once we turn on live migration for the VF
>> or pass through devices in the VM, it all takes place under the hood,
>> users (guest admins, applications) don't have to react upon it or even
>> notice the change. I should note that the majority of live migrations
>> take place between machines with completely identical hardware, it's
>> more critical than necessary to keep the config as-is across the move,
>> stealth while quiet.
>
>
> This usecase is much more complicated and different than what this patch is
> trying
> to address.

Yep, it is technically difficult, but as cloud providers we would like
to take actions to address use case for our own if no one else is
willing to do so. However we're not seeking complicated design or
messing up the others such as your use case. As this is the first time
a real patch of the PV failover approach, although having be discussed
for years, posted to the mailing list. All voices suddenly came over,
various parties wish their specific needs added to the todo list, it's
indeed hard to accommodate all at once in the first place. I went
through same tough period of time while I was doing similar work so I
completely understand that. The task is not easy for sure. :)

The attempts I made was trying to consolidate all potential use cases
into one single solution rather than diverge from the very beginning.
It's in the phase of RFC and I don't want to wait expressing our
interest until very late.

>  Also your usecase seems to be assuming that source and
> destination
> hosts are identical and have the same HW.

Not exactly, this will be positioned as an optimization, but 

[net-next 12/15] i40e: disallow programming multiple filters with same criteria

2018-01-26 Thread Jeff Kirsher
From: Jacob Keller 

Our hardware does not allow situations where two filters might conflict
when matching. Essentially hardware only programs one filter for each
set of matching criteria. We don't support filters with overlapping
input sets, because each flow type can only use a single input set.

Additionally, different flow types will never have overlapping matches,
because of how the hardware parses the flow type before checking
matching criteria.

For this reason, we do not need or use the location number when
programming filters to hardware.

In order to avoid confusing scenarios with filters that match the same
criteria but program the flow to different queues, do not allow multiple
filters that match identical criteria to be programmed.

This ensures that we avoid odd scenarios when deleting filters, and when
programming new filters that match the same criteria.

Instead, users that wish to update the criteria for a filter must use
the same location id, or must delete all the matching filters first.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 87 ++
 1 file changed, 87 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 69644b621b45..b35c61ccc64a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3840,6 +3840,87 @@ static int i40e_check_fdir_input_set(struct i40e_vsi 
*vsi,
return 0;
 }
 
+/**
+ * i40e_match_fdir_filter - Return true if two filters match
+ * @a: pointer to filter struct
+ * @b: pointer to filter struct
+ *
+ * Returns true if the two filters match exactly the same criteria. I.e. they
+ * match the same flow type and have the same parameters. We don't need to
+ * check any input-set since all filters of the same flow type must use the
+ * same input set.
+ **/
+static bool i40e_match_fdir_filter(struct i40e_fdir_filter *a,
+  struct i40e_fdir_filter *b)
+{
+   /* The filters do not match if any of these criteria differ. */
+   if (a->dst_ip != b->dst_ip ||
+   a->src_ip != b->src_ip ||
+   a->dst_port != b->dst_port ||
+   a->src_port != b->src_port ||
+   a->flow_type != b->flow_type ||
+   a->ip4_proto != b->ip4_proto)
+   return false;
+
+   return true;
+}
+
+/**
+ * i40e_disallow_matching_filters - Check that new filters differ
+ * @vsi: pointer to the targeted VSI
+ * @input: new filter to check
+ *
+ * Due to hardware limitations, it is not possible for two filters that match
+ * similar criteria to be programmed at the same time. This is true for a few
+ * reasons:
+ *
+ * (a) all filters matching a particular flow type must use the same input
+ * set, that is they must match the same criteria.
+ * (b) different flow types will never match the same packet, as the flow type
+ * is decided by hardware before checking which rules apply.
+ * (c) hardware has no way to distinguish which order filters apply in.
+ *
+ * Due to this, we can't really support using the location data to order
+ * filters in the hardware parsing. It is technically possible for the user to
+ * request two filters matching the same criteria but which select different
+ * queues. In this case, rather than keep both filters in the list, we reject
+ * the 2nd filter when the user requests adding it.
+ *
+ * This avoids needing to track location for programming the filter to
+ * hardware, and ensures that we avoid some strange scenarios involving
+ * deleting filters which match the same criteria.
+ **/
+static int i40e_disallow_matching_filters(struct i40e_vsi *vsi,
+ struct i40e_fdir_filter *input)
+{
+   struct i40e_pf *pf = vsi->back;
+   struct i40e_fdir_filter *rule;
+   struct hlist_node *node2;
+
+   /* Loop through every filter, and check that it doesn't match */
+   hlist_for_each_entry_safe(rule, node2,
+ >fdir_filter_list, fdir_node) {
+   /* Don't check the filters match if they share the same fd_id,
+* since the new filter is actually just updating the target
+* of the old filter.
+*/
+   if (rule->fd_id == input->fd_id)
+   continue;
+
+   /* If any filters match, then print a warning message to the
+* kernel message buffer and bail out.
+*/
+   if (i40e_match_fdir_filter(rule, input)) {
+   dev_warn(>pdev->dev,
+"Existing user defined filter %d already 
matches this flow.\n",
+rule->fd_id);
+

[net-next 11/15] i40e: program fragmented IPv4 filter input set

2018-01-26 Thread Jeff Kirsher
From: Jacob Keller 

When implementing support for IP_USER_FLOW filters, we correctly
programmed a filter for both the non fragmented IPv4/Other filter, as
well as the fragmented IPv4 filters. However, we did not properly
program the input set for fragmented IPv4 PCTYPE. This meant that the
filters would almost certainly not match, unless the user specified all
of the flow types.

Add support to program the fragmented IPv4 filter input set. Since we
always program these filters together, we'll assume that the two input
sets must match, and will thus always program the input sets to the same
value.

Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 10 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c|  3 +++
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 03f7007d025e..69644b621b45 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3809,6 +3809,16 @@ static int i40e_check_fdir_input_set(struct i40e_vsi 
*vsi,
 
i40e_write_fd_input_set(pf, index, new_mask);
 
+   /* IP_USER_FLOW filters match both IPv4/Other and IPv4/Fragmented
+* frames. If we're programming the input set for IPv4/Other, we also
+* need to program the IPv4/Fragmented input set. Since we don't have
+* separate support, we'll always assume and enforce that the two flow
+* types must have matching input sets.
+*/
+   if (index == I40E_FILTER_PCTYPE_NONF_IPV4_OTHER)
+   i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4,
+   new_mask);
+
/* Add the new offset and update table, if necessary */
if (new_flex_offset) {
err = i40e_add_flex_offset(>l4_flex_pit_list, src_offset,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index db611433120a..25087e21a051 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7680,6 +7680,9 @@ static void i40e_fdir_filter_exit(struct i40e_pf *pf)
/* Reprogram the default input set for Other/IPv4 */
i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_OTHER,
I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
+
+   i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4,
+   I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
 }
 
 /**
-- 
2.14.3



[net-next 06/15] i40e: change flags to use 64 bits

2018-01-26 Thread Jeff Kirsher
From: Alice Michael 

As we have added more flags, we need to now use more
bits and have overflowed the 32-bit size.  So
make it 64.

Also change all the existing bits to unsigned long long
bits.

Signed-off-by: Alice Michael 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h | 67 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  4 +-
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index e019baa905c5..46e9f4e0a02c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -508,39 +508,40 @@ struct i40e_pf {
 #define I40E_HW_PORT_ID_VALID  BIT(17)
 #define I40E_HW_RESTART_AUTONEGBIT(18)
 
-   u32 flags;
-#define I40E_FLAG_RX_CSUM_ENABLED  BIT(0)
-#define I40E_FLAG_MSI_ENABLED  BIT(1)
-#define I40E_FLAG_MSIX_ENABLED BIT(2)
-#define I40E_FLAG_RSS_ENABLED  BIT(3)
-#define I40E_FLAG_VMDQ_ENABLED BIT(4)
-#define I40E_FLAG_FILTER_SYNC  BIT(5)
-#define I40E_FLAG_SRIOV_ENABLEDBIT(6)
-#define I40E_FLAG_DCB_CAPABLE  BIT(7)
-#define I40E_FLAG_DCB_ENABLED  BIT(8)
-#define I40E_FLAG_FD_SB_ENABLEDBIT(9)
-#define I40E_FLAG_FD_ATR_ENABLED   BIT(10)
-#define I40E_FLAG_FD_SB_AUTO_DISABLED  BIT(11)
-#define I40E_FLAG_FD_ATR_AUTO_DISABLED BIT(12)
-#define I40E_FLAG_MFP_ENABLED  BIT(13)
-#define I40E_FLAG_UDP_FILTER_SYNC  BIT(14)
-#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT(15)
-#define I40E_FLAG_VEB_MODE_ENABLED BIT(16)
-#define I40E_FLAG_VEB_STATS_ENABLEDBIT(17)
-#define I40E_FLAG_LINK_POLLING_ENABLED BIT(18)
-#define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT(19)
-#define I40E_FLAG_TEMP_LINK_POLLINGBIT(20)
-#define I40E_FLAG_LEGACY_RXBIT(21)
-#define I40E_FLAG_PTP  BIT(22)
-#define I40E_FLAG_IWARP_ENABLEDBIT(23)
-#define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT(24)
-#define I40E_FLAG_CLIENT_L2_CHANGE BIT(25)
-#define I40E_FLAG_CLIENT_RESET BIT(26)
-#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED   BIT(27)
-#define I40E_FLAG_SOURCE_PRUNING_DISABLED  BIT(28)
-#define I40E_FLAG_TC_MQPRIOBIT(29)
-#define I40E_FLAG_FD_SB_INACTIVE   BIT(30)
-#define I40E_FLAG_FD_SB_TO_CLOUD_FILTERBIT(31)
+   u64 flags;
+#define I40E_FLAG_RX_CSUM_ENABLED  BIT_ULL(0)
+#define I40E_FLAG_MSI_ENABLED  BIT_ULL(1)
+#define I40E_FLAG_MSIX_ENABLED BIT_ULL(2)
+#define I40E_FLAG_RSS_ENABLED  BIT_ULL(3)
+#define I40E_FLAG_VMDQ_ENABLED BIT_ULL(4)
+#define I40E_FLAG_FILTER_SYNC  BIT_ULL(5)
+#define I40E_FLAG_SRIOV_ENABLEDBIT_ULL(6)
+#define I40E_FLAG_DCB_CAPABLE  BIT_ULL(7)
+#define I40E_FLAG_DCB_ENABLED  BIT_ULL(8)
+#define I40E_FLAG_FD_SB_ENABLEDBIT_ULL(9)
+#define I40E_FLAG_FD_ATR_ENABLED   BIT_ULL(10)
+#define I40E_FLAG_FD_SB_AUTO_DISABLED  BIT_ULL(11)
+#define I40E_FLAG_FD_ATR_AUTO_DISABLED BIT_ULL(12)
+#define I40E_FLAG_MFP_ENABLED  BIT_ULL(13)
+#define I40E_FLAG_UDP_FILTER_SYNC  BIT_ULL(14)
+#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT_ULL(15)
+#define I40E_FLAG_VEB_MODE_ENABLED BIT_ULL(16)
+#define I40E_FLAG_VEB_STATS_ENABLEDBIT_ULL(17)
+#define I40E_FLAG_LINK_POLLING_ENABLED BIT_ULL(18)
+#define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT_ULL(19)
+#define I40E_FLAG_TEMP_LINK_POLLINGBIT_ULL(20)
+#define I40E_FLAG_LEGACY_RXBIT_ULL(21)
+#define I40E_FLAG_PTP  BIT_ULL(22)
+#define I40E_FLAG_IWARP_ENABLEDBIT_ULL(23)
+#define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT_ULL(24)
+#define I40E_FLAG_CLIENT_L2_CHANGE BIT_ULL(25)
+#define I40E_FLAG_CLIENT_RESET BIT_ULL(26)
+#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED   BIT_ULL(27)
+#define I40E_FLAG_SOURCE_PRUNING_DISABLED  BIT_ULL(28)
+#define I40E_FLAG_TC_MQPRIOBIT_ULL(29)
+#define I40E_FLAG_FD_SB_INACTIVE   BIT_ULL(30)
+#define I40E_FLAG_FD_SB_TO_CLOUD_FILTERBIT_ULL(31)
+#define I40E_FLAG_DISABLE_FW_LLDP  BIT_ULL(32)
 
struct i40e_client_instance *cinst;
bool stat_offsets_loaded;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 

[net-next 08/15] i40e: fix FW_LLDP flag on init

2018-01-26 Thread Jeff Kirsher
From: Alan Brady 

Using ethtool --set-priv-flags disable-fw-lldp  is persistent
across reboots/reloads so we need some mechanism in the driver to detect
if it's on or off on init so we can set the ethtool private flag
appropriately.  Without this, every time the driver is reloaded the flag
will default to off regardless of whether it's on or off in FW.

We detect this by first attempting to program DCB and if AQ fails
returning I40E_AQ_RC_EPERM, we know that LLDP is disabled in FW.

Signed-off-by: Alan Brady 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index fdeaeb9d44e2..ed0870ff4be2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6351,6 +6351,9 @@ static int i40e_init_pf_dcb(struct i40e_pf *pf)
dev_dbg(>pdev->dev,
"DCBX offload is supported for this PF.\n");
}
+   } else if (pf->hw.aq.asq_last_status == I40E_AQ_RC_EPERM) {
+   dev_info(>pdev->dev, "FW LLDP disabled for this PF.\n");
+   pf->flags |= I40E_FLAG_DISABLE_FW_LLDP;
} else {
dev_info(>pdev->dev,
 "Query for DCB configuration failed, err %s aq_err 
%s\n",
-- 
2.14.3



[net-next 14/15] i40e/i40evf: Record ITR register location in the q_vector

2018-01-26 Thread Jeff Kirsher
From: Alexander Duyck 

The drivers for i40e and i40evf had a reg_idx value stored in the q_vector
that was going completely unused. I can only assume this was copied over
from ixgbe and nobody knew how to use it.

I'm going to make use of the value to avoid having to compute the vector
and thus the register index for multiple paths throughout the drivers.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 12 
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   | 12 
 drivers/net/ethernet/intel/i40evf/i40evf.h  |  4 ++--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c |  1 +
 5 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 25087e21a051..827c082c4356 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -4122,6 +4122,7 @@ static void i40e_vsi_map_rings_to_vectors(struct i40e_vsi 
*vsi)
num_ringpairs = DIV_ROUND_UP(qp_remaining, q_vectors - v_start);
 
q_vector->num_ringpairs = num_ringpairs;
+   q_vector->reg_idx = q_vector->v_idx + vsi->base_vector - 1;
 
q_vector->rx.count = 0;
q_vector->tx.count = 0;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 8d2275830a40..e554aa6cf070 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -956,7 +956,7 @@ static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
 
wr32(>back->hw,
-I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 
1),
+I40E_PFINT_DYN_CTLN(q_vector->reg_idx),
 val);
} else {
val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
@@ -983,8 +983,7 @@ void i40e_force_wb(struct i40e_vsi *vsi, struct 
i40e_q_vector *q_vector)
  /* allow 00 to be written to the index */
 
wr32(>back->hw,
-I40E_PFINT_DYN_CTLN(q_vector->v_idx +
-vsi->base_vector - 1), val);
+I40E_PFINT_DYN_CTLN(q_vector->reg_idx), val);
} else {
u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
@@ -2311,7 +2310,6 @@ static inline void i40e_update_enable_itr(struct i40e_vsi 
*vsi,
struct i40e_hw *hw = >back->hw;
bool rx = false, tx = false;
u32 rxval, txval;
-   int vector;
int idx = q_vector->v_idx;
int rx_itr_setting, tx_itr_setting;
 
@@ -2321,8 +2319,6 @@ static inline void i40e_update_enable_itr(struct i40e_vsi 
*vsi,
return;
}
 
-   vector = (q_vector->v_idx + vsi->base_vector);
-
/* avoid dynamic calculation if in countdown mode OR if
 * all dynamic is disabled
 */
@@ -2371,12 +2367,12 @@ static inline void i40e_update_enable_itr(struct 
i40e_vsi *vsi,
 */
rxval |= BIT(31);
/* don't check _DOWN because interrupt isn't being enabled */
-   wr32(hw, INTREG(vector - 1), rxval);
+   wr32(hw, INTREG(q_vector->reg_idx), rxval);
}
 
 enable_int:
if (!test_bit(__I40E_VSI_DOWN, vsi->state))
-   wr32(hw, INTREG(vector - 1), txval);
+   wr32(hw, INTREG(q_vector->reg_idx), txval);
 
if (q_vector->itr_countdown)
q_vector->itr_countdown--;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index c7831f7f7761..357d6051281f 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -369,8 +369,7 @@ static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
  I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK; /* set noitr */
 
wr32(>back->hw,
-I40E_VFINT_DYN_CTLN1(q_vector->v_idx +
- vsi->base_vector - 1), val);
+I40E_VFINT_DYN_CTLN1(q_vector->reg_idx), val);
q_vector->arm_wb_state = true;
 }
 
@@ -389,7 +388,7 @@ void i40evf_force_wb(struct i40e_vsi *vsi, struct 
i40e_q_vector *q_vector)
  /* allow 00 to be written to the index */;
 
wr32(>back->hw,
-I40E_VFINT_DYN_CTLN1(q_vector->v_idx + vsi->base_vector - 1),
+I40E_VFINT_DYN_CTLN1(q_vector->reg_idx),
 val);
 }
 
@@ -1498,12 +1497,9 @@ static inline void 

[net-next 15/15] i40e: Do not allow use more TC queue pairs than MSI-X vectors exist

2018-01-26 Thread Jeff Kirsher
From: Paweł Jabłoński 

This patch suppresses the message about invalid TC mapping and wrong
selected TX queue. The root cause of this bug was setting too many
TC queue pairs on huge multiprocessor machines. When quantity of the
TC queue pairs is exceeding MSI-X vectors count then TX queue number
can be selected beyond actual TX queues amount.

Signed-off-by: Paweł Jabłoński 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 827c082c4356..f95ce9b5e4fb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1818,6 +1818,10 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi 
*vsi,
num_tc_qps = qcount / numtc;
num_tc_qps = min_t(int, num_tc_qps, i40e_pf_get_max_q_per_tc(pf));
 
+   /* Do not allow use more TC queue pairs than MSI-X vectors exist */
+   if (pf->flags & I40E_FLAG_MSIX_ENABLED)
+   num_tc_qps = min_t(int, num_tc_qps, pf->num_lan_msix);
+
/* Setup queue offset/count for all TCs for given VSI */
for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
/* See if the given TC is enabled for the given VSI */
-- 
2.14.3



[net-next 09/15] i40e: cleanup unnecessary parens

2018-01-26 Thread Jeff Kirsher
Clean up unnecessary parenthesis.

Signed-off-by: Jeff Kirsher 
Tested-by: Andrew Bowers 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.c   | 2 +-
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index c4fa06dd0a2e..e78971605e0b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -976,7 +976,7 @@ i40e_status i40e_clean_arq_element(struct i40e_hw *hw,
}
 
/* set next_to_use to head */
-   ntu = (rd32(hw, hw->aq.arq.head) & I40E_PF_ARQH_ARQH_MASK);
+   ntu = rd32(hw, hw->aq.arq.head) & I40E_PF_ARQH_ARQH_MASK;
if (ntu == ntc) {
/* nothing to do - shouldn't need to update ring's values */
ret_code = I40E_ERR_ADMIN_QUEUE_NO_WORK;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
index ae3a74067425..d1aab6b8bfb1 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
@@ -906,7 +906,7 @@ i40e_status i40evf_clean_arq_element(struct i40e_hw *hw,
}
 
/* set next_to_use to head */
-   ntu = (rd32(hw, hw->aq.arq.head) & I40E_VF_ARQH1_ARQH_MASK);
+   ntu = rd32(hw, hw->aq.arq.head) & I40E_VF_ARQH1_ARQH_MASK;
if (ntu == ntc) {
/* nothing to do - shouldn't need to update ring's values */
ret_code = I40E_ERR_ADMIN_QUEUE_NO_WORK;
-- 
2.14.3



[net-next 01/15] i40e: Add returning AQ critical error to SW

2018-01-26 Thread Jeff Kirsher
From: Michal Kosiarz 

The FW has the ability to return a critical error on every AQ command.
When this critical error occurs then we need to send the correct response
to the caller.

Signed-off-by: Michal Kosiarz 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.c   | 13 +
 drivers/net/ethernet/intel/i40e/i40e_common.c   |  2 ++
 drivers/net/ethernet/intel/i40e/i40e_status.h   |  1 +
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c | 13 +
 drivers/net/ethernet/intel/i40evf/i40e_common.c |  2 ++
 drivers/net/ethernet/intel/i40evf/i40e_status.h |  1 +
 6 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index d9670cd8743f..c4fa06dd0a2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -907,10 +907,15 @@ i40e_status i40e_asq_send_command(struct i40e_hw *hw,
/* update the error if time out occurred */
if ((!cmd_completed) &&
(!details->async && !details->postpone)) {
-   i40e_debug(hw,
-  I40E_DEBUG_AQ_MESSAGE,
-  "AQTX: Writeback timeout.\n");
-   status = I40E_ERR_ADMIN_QUEUE_TIMEOUT;
+   if (rd32(hw, hw->aq.asq.len) & I40E_GL_ATQLEN_ATQCRIT_MASK) {
+   i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+  "AQTX: AQ Critical error.\n");
+   status = I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR;
+   } else {
+   i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+  "AQTX: Writeback timeout.\n");
+   status = I40E_ERR_ADMIN_QUEUE_TIMEOUT;
+   }
}
 
 asq_send_command_error:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index ee6052ecd215..c690e9c64c48 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -278,6 +278,8 @@ const char *i40e_stat_str(struct i40e_hw *hw, i40e_status 
stat_err)
return "I40E_NOT_SUPPORTED";
case I40E_ERR_FIRMWARE_API_VERSION:
return "I40E_ERR_FIRMWARE_API_VERSION";
+   case I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR:
+   return "I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR";
}
 
snprintf(hw->err_str, sizeof(hw->err_str), "%d", stat_err);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_status.h 
b/drivers/net/ethernet/intel/i40e/i40e_status.h
index 5f9cac55aa55..afb72e711d43 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_status.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_status.h
@@ -95,6 +95,7 @@ enum i40e_status_code {
I40E_ERR_NOT_READY  = -63,
I40E_NOT_SUPPORTED  = -64,
I40E_ERR_FIRMWARE_API_VERSION   = -65,
+   I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR = -66,
 };
 
 #endif /* _I40E_STATUS_H_ */
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
index 8b0d4b255dea..ae3a74067425 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
@@ -837,10 +837,15 @@ i40e_status i40evf_asq_send_command(struct i40e_hw *hw,
/* update the error if time out occurred */
if ((!cmd_completed) &&
(!details->async && !details->postpone)) {
-   i40e_debug(hw,
-  I40E_DEBUG_AQ_MESSAGE,
-  "AQTX: Writeback timeout.\n");
-   status = I40E_ERR_ADMIN_QUEUE_TIMEOUT;
+   if (rd32(hw, hw->aq.asq.len) & I40E_VF_ATQLEN1_ATQCRIT_MASK) {
+   i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+  "AQTX: AQ Critical error.\n");
+   status = I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR;
+   } else {
+   i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+  "AQTX: Writeback timeout.\n");
+   status = I40E_ERR_ADMIN_QUEUE_TIMEOUT;
+   }
}
 
 asq_send_command_error:
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c 
b/drivers/net/ethernet/intel/i40evf/i40e_common.c
index a94648429a5b..67bf5cebb76f 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c
@@ -284,6 +284,8 @@ const char *i40evf_stat_str(struct i40e_hw *hw, i40e_status 
stat_err)
return "I40E_NOT_SUPPORTED";
case I40E_ERR_FIRMWARE_API_VERSION:
return "I40E_ERR_FIRMWARE_API_VERSION";
+   case I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR:
+  

[net-next 10/15] i40e: Fix kdump failure

2018-01-26 Thread Jeff Kirsher
From: Avinash Dayanand 

kdump fails in the system when used in conjunction with Ethernet driver
X722/X710. This is mainly because when we are resource constrained i.e.
when we have just one online_cpus, we are enabling VMDq and iWARP. It
doesn't make sense to enable them with just one CPU and starve kdump
for lack of IRQs.

So don't enable VMDq or iWARP when we just have a single CPU.

Signed-off-by: Avinash Dayanand 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ed0870ff4be2..db611433120a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11069,13 +11069,13 @@ static int i40e_sw_init(struct i40e_pf *pf)
pf->hw.aq.fw_maj_ver >= 6)
pf->hw_features |= I40E_HW_PTP_L4_CAPABLE;
 
-   if (pf->hw.func_caps.vmdq) {
+   if (pf->hw.func_caps.vmdq && num_online_cpus() != 1) {
pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI;
pf->flags |= I40E_FLAG_VMDQ_ENABLED;
pf->num_vmdq_qps = i40e_default_queues_per_vmdq(pf);
}
 
-   if (pf->hw.func_caps.iwarp) {
+   if (pf->hw.func_caps.iwarp && num_online_cpus() != 1) {
pf->flags |= I40E_FLAG_IWARP_ENABLED;
/* IWARP needs one extra vector for CQP just like MISC.*/
pf->num_iwarp_msix = (int)num_online_cpus() + 1;
-- 
2.14.3



[net-next 03/15] i40evf: Allow turning off offloads when the VF has VLAN set

2018-01-26 Thread Jeff Kirsher
From: Paweł Jabłoński 

This patch adds back the capability to turn off offloads when VF has
VLAN set. The commit 0a3b4f702fb1 ("i40evf: enable support for VF VLAN
tag stripping control") adds the i40evf_set_features function and
changes the 'turn off' flow for offloads. This patch adds that
capability back by moving checking the VLAN option for VF to the
next statement.

Signed-off-by: Paweł Jabłoński 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 8934f784e96f..d59bf060196b 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2344,13 +2344,19 @@ static int i40evf_set_features(struct net_device 
*netdev,
 {
struct i40evf_adapter *adapter = netdev_priv(netdev);
 
-   if (!VLAN_ALLOWED(adapter))
+   /* Don't allow changing VLAN_RX flag when VLAN is set for VF
+* and return an error in this case
+*/
+   if (VLAN_ALLOWED(adapter)) {
+   if (features & NETIF_F_HW_VLAN_CTAG_RX)
+   adapter->aq_required |=
+   I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING;
+   else
+   adapter->aq_required |=
+   I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING;
+   } else if ((netdev->features ^ features) & NETIF_F_HW_VLAN_CTAG_RX) {
return -EINVAL;
-
-   if (features & NETIF_F_HW_VLAN_CTAG_RX)
-   adapter->aq_required |= I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING;
-   else
-   adapter->aq_required |= I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING;
+   }
 
return 0;
 }
-- 
2.14.3



[net-next 02/15] i40e: Fix for adding multiple ethtool filters on the same location

2018-01-26 Thread Jeff Kirsher
From: Patryk Małek 

This patch reorders i40e_add_del_fdir and i40e_update_ethtool_fdir_entry
calls so that we first remove an already existing filter (inside
i40e_update_ethtool_fdir_entry using i40e_add_del_fdir) and then
we add a new one with i40e_add_del_fdir.
After applying this patch, creating multiple identical filters (with
the same location) one after another doesn't revert their behavior
but behaves correctly.

Signed-off-by: Patryk Małek 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 34173f821fd9..2cbd564e437a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3939,19 +3939,19 @@ static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi,
input->flex_offset = userdef.flex_offset;
}
 
-   ret = i40e_add_del_fdir(vsi, input, true);
-   if (ret)
-   goto free_input;
-
/* Add the input filter to the fdir_input_list, possibly replacing
 * a previous filter. Do not free the input structure after adding it
 * to the list as this would cause a use-after-free bug.
 */
i40e_update_ethtool_fdir_entry(vsi, input, fsp->location, NULL);
-
+   ret = i40e_add_del_fdir(vsi, input, true);
+   if (ret)
+   goto remove_sw_rule;
return 0;
 
-free_input:
+remove_sw_rule:
+   hlist_del(>fdir_node);
+   pf->fdir_pf_active_filters--;
kfree(input);
return ret;
 }
-- 
2.14.3



[net-next 04/15] i40e/i40evf: Use ring pointers to clean up _set_itr_per_queue

2018-01-26 Thread Jeff Kirsher
From: Alexander Duyck 

This change cleans up the i40e/i40evf_set_itr_per_queue function by
dropping all the unneeded pointer chases. Instead we can just pull out the
pointers for the Tx and Rx rings and use them throughout the function.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 22 +
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 28 +++---
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 2cbd564e437a..dd6996e65396 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -2305,6 +2305,8 @@ static void i40e_set_itr_per_queue(struct i40e_vsi *vsi,
   struct ethtool_coalesce *ec,
   int queue)
 {
+   struct i40e_ring *rx_ring = vsi->rx_rings[queue];
+   struct i40e_ring *tx_ring = vsi->tx_rings[queue];
struct i40e_pf *pf = vsi->back;
struct i40e_hw *hw = &pf->hw;
struct i40e_q_vector *q_vector;
@@ -2312,26 +2314,26 @@ static void i40e_set_itr_per_queue(struct i40e_vsi *vsi,
 
intrl = i40e_intrl_usec_to_reg(vsi->int_rate_limit);
 
-   vsi->rx_rings[queue]->rx_itr_setting = ec->rx_coalesce_usecs;
-   vsi->tx_rings[queue]->tx_itr_setting = ec->tx_coalesce_usecs;
+   rx_ring->rx_itr_setting = ec->rx_coalesce_usecs;
+   tx_ring->tx_itr_setting = ec->tx_coalesce_usecs;
 
if (ec->use_adaptive_rx_coalesce)
-   vsi->rx_rings[queue]->rx_itr_setting |= I40E_ITR_DYNAMIC;
+   rx_ring->rx_itr_setting |= I40E_ITR_DYNAMIC;
else
-   vsi->rx_rings[queue]->rx_itr_setting &= ~I40E_ITR_DYNAMIC;
+   rx_ring->rx_itr_setting &= ~I40E_ITR_DYNAMIC;
 
if (ec->use_adaptive_tx_coalesce)
-   vsi->tx_rings[queue]->tx_itr_setting |= I40E_ITR_DYNAMIC;
+   tx_ring->tx_itr_setting |= I40E_ITR_DYNAMIC;
else
-   vsi->tx_rings[queue]->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
+   tx_ring->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
 
-   q_vector = vsi->rx_rings[queue]->q_vector;
-   q_vector->rx.itr = ITR_TO_REG(vsi->rx_rings[queue]->rx_itr_setting);
+   q_vector = rx_ring->q_vector;
+   q_vector->rx.itr = ITR_TO_REG(rx_ring->rx_itr_setting);
vector = vsi->base_vector + q_vector->v_idx;
wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, vector - 1), q_vector->rx.itr);
 
-   q_vector = vsi->tx_rings[queue]->q_vector;
-   q_vector->tx.itr = ITR_TO_REG(vsi->tx_rings[queue]->tx_itr_setting);
+   q_vector = tx_ring->q_vector;
+   q_vector->tx.itr = ITR_TO_REG(tx_ring->tx_itr_setting);
vector = vsi->base_vector + q_vector->v_idx;
wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, vector - 1), q_vector->tx.itr);
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index da006fa3fec1..e2d8aa19d205 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -512,31 +512,31 @@ static void i40evf_set_itr_per_queue(struct 
i40evf_adapter *adapter,
 struct ethtool_coalesce *ec,
 int queue)
 {
+   struct i40e_ring *rx_ring = &adapter->rx_rings[queue];
+   struct i40e_ring *tx_ring = &adapter->tx_rings[queue];
struct i40e_vsi *vsi = &adapter->vsi;
struct i40e_hw *hw = &adapter->hw;
struct i40e_q_vector *q_vector;
u16 vector;
 
-   adapter->rx_rings[queue].rx_itr_setting = ec->rx_coalesce_usecs;
-   adapter->tx_rings[queue].tx_itr_setting = ec->tx_coalesce_usecs;
+   rx_ring->rx_itr_setting = ec->rx_coalesce_usecs;
+   tx_ring->tx_itr_setting = ec->tx_coalesce_usecs;
 
-   if (ec->use_adaptive_rx_coalesce)
-   adapter->rx_rings[queue].rx_itr_setting |= I40E_ITR_DYNAMIC;
-   else
-   adapter->rx_rings[queue].rx_itr_setting &= ~I40E_ITR_DYNAMIC;
+   rx_ring->rx_itr_setting |= I40E_ITR_DYNAMIC;
+   if (!ec->use_adaptive_rx_coalesce)
+   rx_ring->rx_itr_setting ^= I40E_ITR_DYNAMIC;
 
-   if (ec->use_adaptive_tx_coalesce)
-   adapter->tx_rings[queue].tx_itr_setting |= I40E_ITR_DYNAMIC;
-   else
-   adapter->tx_rings[queue].tx_itr_setting &= ~I40E_ITR_DYNAMIC;
+   tx_ring->tx_itr_setting |= I40E_ITR_DYNAMIC;
+   if (!ec->use_adaptive_tx_coalesce)
+   tx_ring->tx_itr_setting ^= I40E_ITR_DYNAMIC;
 
-   q_vector = adapter->rx_rings[queue].q_vector;
-   q_vector->rx.itr = ITR_TO_REG(adapter->rx_rings[queue].rx_itr_setting);
+   q_vector = 

[net-next 05/15] i40e: Display LLDP information on vSphere Web Client

2018-01-26 Thread Jeff Kirsher
From: Upasana Menon 

This patch enables driver to display LLDP information on the vSphere Web
Client with Intel adapters (X710, XL710) and Distributed Virtual Switch.

Signed-off-by: Upasana Menon 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  | 12 ++
 drivers/net/ethernet/intel/i40e/i40e_common.c  | 27 ++
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |  4 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h| 12 ++
 4 files changed, 55 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 0d471b0db0f4..a852775d3059 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -205,6 +205,7 @@ enum i40e_admin_queue_opc {
/* DCB commands */
i40e_aqc_opc_dcb_ignore_pfc = 0x0301,
i40e_aqc_opc_dcb_updated= 0x0302,
+   i40e_aqc_opc_set_dcb_parameters = 0x0303,
 
/* TX scheduler */
i40e_aqc_opc_configure_vsi_bw_limit = 0x0400,
@@ -2496,6 +2497,17 @@ struct i40e_aqc_lldp_start {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start);
 
+/* Set DCB (direct 0x0303) */
+struct i40e_aqc_set_dcb_parameters {
+   u8 command;
+#define I40E_AQ_DCB_SET_AGENT  0x1
+#define I40E_DCB_VALID 0x1
+   u8 valid_flags;
+   u8 reserved[14];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_dcb_parameters);
+
 /* Get CEE DCBX Oper Config (0x0A07)
  * uses the generic descriptor struct
  * returns below as indirect response
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index c690e9c64c48..ef5a868aae46 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -3641,7 +3641,34 @@ i40e_status i40e_aq_start_lldp(struct i40e_hw *hw,
i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_start);

cmd->command = I40E_AQ_LLDP_AGENT_START;
+   status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 
+   return status;
+}
+
+/**
+ * i40e_aq_set_dcb_parameters
+ * @hw: pointer to the hw struct
+ * @cmd_details: pointer to command details structure or NULL
+ * @dcb_enable: True if DCB configuration needs to be applied
+ *
+ **/
+enum i40e_status_code
+i40e_aq_set_dcb_parameters(struct i40e_hw *hw, bool dcb_enable,
+  struct i40e_asq_cmd_details *cmd_details)
+{
+   struct i40e_aq_desc desc;
+   struct i40e_aqc_set_dcb_parameters *cmd =
+   (struct i40e_aqc_set_dcb_parameters *)&desc.params.raw;
+   i40e_status status;
+
+   i40e_fill_default_direct_cmd_desc(&desc,
+ i40e_aqc_opc_set_dcb_parameters);
+
+   if (dcb_enable) {
+   cmd->valid_flags = I40E_DCB_VALID;
+   cmd->command = I40E_AQ_DCB_SET_AGENT;
+   }
status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 
return status;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h 
b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 187dd53e0056..83798b7841b9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -225,6 +225,10 @@ i40e_status i40e_aq_cfg_lldp_mib_change_event(struct 
i40e_hw *hw,
struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
struct i40e_asq_cmd_details *cmd_details);
+i40e_status i40e_aq_set_dcb_parameters(struct i40e_hw *hw,
+  bool dcb_enable,
+  struct i40e_asq_cmd_details
+  *cmd_details);
 i40e_status i40e_aq_start_lldp(struct i40e_hw *hw,
struct i40e_asq_cmd_details *cmd_details);
 i40e_status i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
index b0e6454995b6..815de8d9c3fb 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
@@ -205,6 +205,7 @@ enum i40e_admin_queue_opc {
/* DCB commands */
i40e_aqc_opc_dcb_ignore_pfc = 0x0301,
i40e_aqc_opc_dcb_updated= 0x0302,
+   i40e_aqc_opc_set_dcb_parameters = 0x0303,
 
/* TX scheduler */
i40e_aqc_opc_configure_vsi_bw_limit = 0x0400,
@@ -2461,6 +2462,17 @@ struct i40e_aqc_lldp_start {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start);
 
+/* Set DCB (direct 0x0303) */
+struct i40e_aqc_set_dcb_parameters {
+   u8 command;
+#define 

[net-next 07/15] i40e: Implement an ethtool private flag to stop LLDP in FW

2018-01-26 Thread Jeff Kirsher
From: Dave Ertman 

Implement the private flag disable-fw-lldp for ethtool
to disable the processing of LLDP packets by the FW.
This will stop the FW from consuming LLDPDU and cause
them to be sent up the stack.

The FW is also being configured to apply a default DCB
configuration on link up.

Toggling the value of this flag will also cause a PF reset.

Disabling FW DCB will also disable DCBx.

Signed-off-by: Dave Ertman 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 47 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c| 14 ++--
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 8b0062ec8edb..03f7007d025e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -233,6 +233,7 @@ static const struct i40e_priv_flags 
i40e_gstrings_priv_flags[] = {
I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0),
I40E_PRIV_FLAG("disable-source-pruning",
   I40E_FLAG_SOURCE_PRUNING_DISABLED, 0),
+   I40E_PRIV_FLAG("disable-fw-lldp", I40E_FLAG_DISABLE_FW_LLDP, 0),
 };
 
 #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags)
@@ -4317,6 +4318,25 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
!(pf->hw_features & I40E_HW_ATR_EVICT_CAPABLE))
return -EOPNOTSUPP;
 
+   /* Disable FW LLDP not supported if NPAR active or if FW
+* API version < 1.7
+*/
+   if (new_flags & I40E_FLAG_DISABLE_FW_LLDP) {
+   if (pf->hw.func_caps.npar_enable) {
+   dev_warn(&pf->pdev->dev,
+"Unable to stop FW LLDP if NPAR active\n");
+   return -EOPNOTSUPP;
+   }
+
+   if (pf->hw.aq.api_maj_ver < 1 ||
+   (pf->hw.aq.api_maj_ver == 1 &&
+pf->hw.aq.api_min_ver < 7)) {
+   dev_warn(&pf->pdev->dev,
+"FW ver does not support stopping FW LLDP\n");
+   return -EOPNOTSUPP;
+   }
+   }
+
/* Compare and exchange the new flags into place. If we failed, that
 * is if cmpxchg returns anything but the old value, this means that
 * something else has modified the flags variable since we copied it
@@ -4362,12 +4382,37 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
}
}
 
+   if (changed_flags & I40E_FLAG_DISABLE_FW_LLDP) {
+   if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP) {
+   struct i40e_dcbx_config *dcbcfg;
+   int i;
+
+   i40e_aq_stop_lldp(&pf->hw, true, NULL);
+   i40e_aq_set_dcb_parameters(&pf->hw, true, NULL);
+   /* reset local_dcbx_config to default */
+   dcbcfg = &pf->hw.local_dcbx_config;
+   dcbcfg->etscfg.willing = 1;
+   dcbcfg->etscfg.maxtcs = 0;
+   dcbcfg->etscfg.tcbwtable[0] = 100;
+   for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++)
+   dcbcfg->etscfg.tcbwtable[i] = 0;
+   for (i = 0; i < I40E_MAX_USER_PRIORITY; i++)
+   dcbcfg->etscfg.prioritytable[i] = 0;
+   dcbcfg->etscfg.tsatable[0] = I40E_IEEE_TSA_ETS;
+   dcbcfg->pfc.willing = 1;
+   dcbcfg->pfc.pfccap = I40E_MAX_TRAFFIC_CLASS;
+   } else {
+   i40e_aq_start_lldp(&pf->hw, NULL);
+   }
+   }
+
/* Issue reset to cause things to take effect, as additional bits
 * are added we will need to create a mask of bits requiring reset
 */
if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED |
 I40E_FLAG_LEGACY_RX |
-I40E_FLAG_SOURCE_PRUNING_DISABLED))
+I40E_FLAG_SOURCE_PRUNING_DISABLED |
+I40E_FLAG_DISABLE_FW_LLDP))
i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true);
 
return 0;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2703a92f3778..fdeaeb9d44e2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6320,8 +6320,11 @@ static int i40e_init_pf_dcb(struct i40e_pf *pf)
struct i40e_hw *hw = >hw;
int err = 0;
 
-   /* Do not enable DCB for SW1 and SW2 images even if the FW is capable */
-   if (pf->hw_features & I40E_HW_NO_DCB_SUPPORT)
+   /* 

[net-next 13/15] i40e: fix reported mask for ntuple filters

2018-01-26 Thread Jeff Kirsher
From: Jacob Keller 

In commit 36777d9fa24c ("i40e: check current configured input set when
adding ntuple filters") some code was added to report the input set
mask for a given filter when reporting it to the user.

This code is necessary so that the reported filter correctly displays
that it is or is not masking certain fields.

Unfortunately the code was incorrect. Development error accidentally
swapped the mask values for the IPv4 addresses with the L4 port numbers.
The port numbers are only 16bits wide while IPv4 addresses are 32 bits.
Unfortunately we assigned only 16 bits to the IPv4 address masks.
Additionally we assigned 32bit value 0xFFFFFFFF to the TCP port numbers.
This second part does not matter as the value would be truncated to
16bits regardless, but it is unnecessary.

Fix the reported masks to properly report that the entire field is
masked.

Fixes: 36777d9fa24c ("i40e: check current configured input set when adding 
ntuple filters")
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index b35c61ccc64a..2f5bee713fef 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -2749,16 +2749,16 @@ static int i40e_get_ethtool_fdir_entry(struct i40e_pf 
*pf,
 
 no_input_set:
if (input_set & I40E_L3_SRC_MASK)
-   fsp->m_u.tcp_ip4_spec.ip4src = htonl(0xFFFF);
+   fsp->m_u.tcp_ip4_spec.ip4src = htonl(0xFFFFFFFF);
 
 if (input_set & I40E_L3_DST_MASK)
-   fsp->m_u.tcp_ip4_spec.ip4dst = htonl(0xFFFF);
+   fsp->m_u.tcp_ip4_spec.ip4dst = htonl(0xFFFFFFFF);
 
 if (input_set & I40E_L4_SRC_MASK)
-   fsp->m_u.tcp_ip4_spec.psrc = htons(0xFFFFFFFF);
+   fsp->m_u.tcp_ip4_spec.psrc = htons(0xFFFF);
 
 if (input_set & I40E_L4_DST_MASK)
-   fsp->m_u.tcp_ip4_spec.pdst = htons(0xFFFFFFFF);
+   fsp->m_u.tcp_ip4_spec.pdst = htons(0xFFFF);
 
if (rule->dest_ctl == I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET)
fsp->ring_cookie = RX_CLS_FLOW_DISC;
-- 
2.14.3



[net-next 00/15][pull request] 40GbE Intel Wired LAN Driver Updates 2018-01-26

2018-01-26 Thread Jeff Kirsher
This series contains updates to i40e and i40evf.

Michal updates the driver to pass critical errors from the firmware to
the caller.

Patryk fixes an issue of creating multiple identical filters with the
same location, by simply moving the functions so that we remove the
existing filter and then add the new filter.

Paweł adds back in the ability to turn off offloads when VLAN is set for
the VF driver.  Fixed an issue where the number of TC queue pairs was
exceeding MSI-X vectors count, causing messages about invalid TC mapping
and wrong selected Tx queue.

Alex cleans up the i40e/i40evf_set_itr_per_queue() by dropping all the
unneeded pointer chases.  Puts to use the reg_idx value, which was going
unused, so that we can avoid having to compute the vector every time
throughout the driver.

Upasana enable the driver to display LLDP information on the vSphere Web
Client by exposing DCB parameters.

Alice converts our flags from 32 to 64 bit size, since we have added
more flags.

Dave implements a private ethtool flag to disable the processing of LLDP
packets by the firmware, so that the firmware will not consume LLDPDU
and cause them to be sent up the stack.

Alan adds a mechanism for detecting/storing the flag for processing of
LLDP packets by the firmware, so that its current state is persistent
across reboots/reloads of the driver.

Avinash fixes kdump with i40e due to resource constraints.  We were
enabling VMDq and iWARP when we just have a single CPU, which was
starving kdump for the lack of IRQs.

Jake adds support to program the fragmented IPv4 input set PCTYPE.
Fixed the reported masks to properly report that the entire field is
masked, since we had accidentally swapped the mask values for the IPv4
addresses with the L4 port numbers.

The following are changes since commit 9515a2e082f91457db0ecff4b65371d0fb5d9aad:
  net/ipv4: Allow send to local broadcast from a socket bound to a VRF
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alan Brady (1):
  i40e: fix FW_LLDP flag on init

Alexander Duyck (2):
  i40e/i40evf: Use ring pointers to clean up _set_itr_per_queue
  i40e/i40evf: Record ITR register location in the q_vector

Alice Michael (1):
  i40e: change flags to use 64 bits

Avinash Dayanand (1):
  i40e: Fix kdump failure

Dave Ertman (1):
  i40e: Implement an ethtool private flag to stop LLDP in FW

Jacob Keller (3):
  i40e: program fragmented IPv4 filter input set
  i40e: disallow programming multiple filters with same criteria
  i40e: fix reported mask for ntuple filters

Jeff Kirsher (1):
  i40e: cleanup unnecessary parens

Michal Kosiarz (1):
  i40e: Add returning AQ critical error to SW

Patryk Małek (1):
  i40e: Fix for adding multiple ethtool filters on the same location

Paweł Jabłoński (2):
  i40evf: Allow turning off offloads when the VF has VLAN set
  i40e: Do not allow use more TC queue pairs than MSI-X vectors exist

Upasana Menon (1):
  i40e: Display LLDP information on vSphere Web Client

 drivers/net/ethernet/intel/i40e/i40e.h |  67 
 drivers/net/ethernet/intel/i40e/i40e_adminq.c  |  15 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  12 ++
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  29 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 186 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c|  29 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   4 +
 drivers/net/ethernet/intel/i40e/i40e_status.h  |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|  12 +-
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c|  15 +-
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  12 ++
 drivers/net/ethernet/intel/i40evf/i40e_common.c|   2 +
 drivers/net/ethernet/intel/i40evf/i40e_status.h|   1 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  |  12 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h |   4 +-
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  28 ++--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|  19 ++-
 17 files changed, 342 insertions(+), 106 deletions(-)

-- 
2.14.3



Re: [PATCH v2 net-next 2/2] kcm: Check if sk_user_data already set in kcm_attach

2018-01-26 Thread Eric Dumazet
On Wed, 2018-01-24 at 12:35 -0800, Tom Herbert wrote:
> This is needed to prevent sk_user_data being overwritten.
> The check is done under the callback lock. This should prevent
> a socket from being attached twice to a KCM mux. It also prevents
> a socket from being attached for other use cases of sk_user_data
> as long as the other cases set sk_user_data under the lock.
> Followup work is needed to unify all the use cases of sk_user_data
> to use the same locking.
> 
> Reported-by: syzbot+114b15f2be420a888...@syzkaller.appspotmail.com
> Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
> Signed-off-by: Tom Herbert 
> ---
>  net/kcm/kcmsock.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index 7632797fb68e..4a8d407f8902 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -1410,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct 
> socket *csock,
>   return err;
>   }
>  
> - sock_hold(csk);
> -
>   write_lock_bh(&csk->sk_callback_lock);
> +
> + /* Check if sk_user_data is aready by KCM or someone else.
> +  * Must be done under lock to prevent race conditions.
> +  */
> + if (csk->sk_user_data) {
> + write_unlock_bh(&csk->sk_callback_lock);
> + strp_done(&psock->strp);

Although it seems psock->strp->stopped wont be set ?

We should hit WARN_ON(!strp->stopped);



Re: [bpf-next PATCH v2 2/3] bpf: sockmap, add sock close() hook to remove socks

2018-01-26 Thread John Fastabend
On 01/26/2018 10:14 AM, John Fastabend wrote:
> The selftests test_maps program was leaving dangling BPF sockmap
> programs around because not all psock elements were removed from
> the map. The elements in turn hold a reference on the BPF program
> they are attached to causing BPF programs to stay open even after
> test_maps has completed.
> 
> The original intent was that sk_state_change() would be called
> when TCP socks went through TCP_CLOSE state. However, because
> socks may be in SOCK_DEAD state or the sock may be a listening
> socket the event is not always triggered.
> 
> To resolve this use the ULP infrastructure and register our own
> proto close() handler. This fixes the above case.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Reported-by: Prashant Bhole 
> Signed-off-by: John Fastabend 
> ---

[...]

v3 will be needed.

> +void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> + struct smap_psock_map_entry *e, *tmp;
> + struct smap_psock *psock;
> + struct sock *osk;
> +
> + psock = smap_psock_sk(sk);
> + if (unlikely(!psock))
> + return sk->sk_prot->close(sk, timeout);
> +
> + write_lock_bh(>sk_callback_lock);
> + list_for_each_entry_safe(e, tmp, >maps, list) {
> + osk = cmpxchg(e->entry, sk, NULL);
> + if (osk == sk) {
> + list_del(>list);
> + smap_release_sock(psock, sk);
> + }
> + }
> + write_unlock_bh(>sk_callback_lock);
> + return psock->save_close(sk, timeout);

We need this to be in an RCU critical section. Else the release op
could free the psock immediately presumably. I've not actually triggered
this case, but seems possible. Also the save_close call can not be done
inside RCU so we need to cache the value and run it at the end. This is
OK because we have the sock lock held.

Probably like this,

void bpf_tcp_close(struct sock *sk, long timeout)
{
struct smap_psock_map_entry *e, *tmp;
struct smap_psock *psock;
struct sock *osk;
void (*close_fun)(struct sock *sk, long timeout);

rcu_read_lock();
psock = smap_psock_sk(sk);
if (unlikely(!psock))
return sk->sk_prot->close(sk, timeout);

/* Although the psock may be destroyed, after RCU grace period, the
 * sk will not because we are holding the sock lock here. So we can
 * call the original close routine outside the RCU critical section.
 */
close_fun = psock->save_close;

write_lock_bh(&sk->sk_callback_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
list_del(&e->list);
smap_release_sock(psock, sk);
}
}
write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock(); 
close_fun(sk, timeout);
}


Re: [PATCH 3/3] Revert "e1000e: Do not read ICR in Other interrupt"

2018-01-26 Thread Alexander Duyck
On Fri, Jan 26, 2018 at 1:12 AM, Benjamin Poirier  wrote:
> This reverts commit 16ecba59bc333d6282ee057fb02339f77a880beb.
>
> It was reported that emulated e1000e devices in vmware esxi 6.5 Build
> 7526125 do not link up after commit 4aea7a5c5e94 ("e1000e: Avoid receiver
> overrun interrupt bursts"). Some tracing shows that after
> e1000e_trigger_lsc() is called, ICR reads out as 0x0 in e1000_msix_other()
> on emulated e1000e devices. In comparison, on real e1000e 82574 hardware,
> icr=0x8004 (_INT_ASSERTED | _LSC) in the same situation.
>
> Some experimentation showed that this flaw in vmware e1000e emulation can
> be worked around by not setting Other in EIAC. This is how it was before
> commit 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt").
>
> Since the ICR read in the Other interrupt handler has already been
> restored, this patch effectively reverts the remainder of commit
> 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt").
>
> Fixes: 4aea7a5c5e94 ("e1000e: Avoid receiver overrun interrupt bursts")
> Signed-off-by: Benjamin Poirier 
> ---
>  drivers/net/ethernet/intel/e1000e/netdev.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
> b/drivers/net/ethernet/intel/e1000e/netdev.c
> index ed103b9a8d3a..fffc1f0e3895 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -1916,6 +1916,13 @@ static irqreturn_t e1000_msix_other(int 
> __always_unused irq, void *data)
> struct e1000_hw *hw = >hw;
> u32 icr = er32(ICR);
>
> +   /* Certain events (such as RXO) which trigger Other do not set
> +* INT_ASSERTED. In that case, read to clear of icr does not take
> +* place.
> +*/
> +   if (!(icr & E1000_ICR_INT_ASSERTED))
> +   ew32(ICR, E1000_ICR_OTHER);
> +

This piece doesn't make sense to me. Why are we clearing OTHER if
ICR_INT_ASSERTED is not set? The original code that was removed was in
commit 4d432f67ff00 "e1000e: Remove unreachable code" was setting IMS
and returning, not clearing the ICR register. I would argue that the
code is probably unreachable and if we just have the checks for OTHER
and LSC then we should be taking care of all of this in the task
anyway. All this code the code in the original was doing was
re-enabling the interrupt via IMS so we probably don't need this bit
as long as we are clearing OTHER and LSC when they are set so that we
can get future interrupts.

> if (icr & adapter->eiac_mask)
> ew32(ICS, (icr & adapter->eiac_mask));
>
> @@ -2033,7 +2040,6 @@ static void e1000_configure_msix(struct e1000_adapter 
> *adapter)
>hw->hw_addr + E1000_EITR_82574(vector));
> else
> writel(1, hw->hw_addr + E1000_EITR_82574(vector));
> -   adapter->eiac_mask |= E1000_IMS_OTHER;
>
> /* Cause Tx interrupts on every write back */
> ivar |= BIT(31);
> @@ -2258,7 +2264,7 @@ static void e1000_irq_enable(struct e1000_adapter 
> *adapter)
>
> if (adapter->msix_entries) {
> ew32(EIAC_82574, adapter->eiac_mask & E1000_EIAC_MASK_82574);
> -   ew32(IMS, adapter->eiac_mask | E1000_IMS_LSC);
> +   ew32(IMS, adapter->eiac_mask | E1000_IMS_OTHER | 
> E1000_IMS_LSC);
> } else if (hw->mac.type >= e1000_pch_lpt) {
> ew32(IMS, IMS_ENABLE_MASK | E1000_IMS_ECCER);
> } else {
> --
> 2.15.1
>


Re: [PATCH iproute2-next 0/2] tc: qdisc: provide JSON output for RED and prio

2018-01-26 Thread David Ahern
On 1/26/18 12:27 PM, Jakub Kicinski wrote:
> Hi!
> 
> This small series adds support for JSON output for prio and RED qdiscs.
> 
> Jakub Kicinski (2):
>   tc: red: JSON-ify RED output
>   tc: prio: JSON-ify prio output
> 
>  include/json_print.h |  1 +
>  lib/json_print.c |  1 +
>  lib/json_writer.c|  4 
>  tc/q_prio.c  | 12 
>  tc/q_red.c   | 41 ++---
>  5 files changed, 36 insertions(+), 23 deletions(-)
> 

Series applied to iproute2-next.


Re: [nf-next] netfilter: Add support for inner IPv6 packet match

2018-01-26 Thread Ahmed Abdelsalam
Hi Pablo, 

> Hi Ahmed,
> 
> On Thu, Jan 18, 2018 at 04:13:25PM +0100, Ahmed Abdelsalam wrote:
> [...]
> > diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_inner6.h 
> > b/include/uapi/linux/netfilter_ipv6/ip6t_inner6.h
> Matching at inner headers is a very useful, no doubt. Problem is that
> this approach is rather limited since it only allows for matching
> source and destination address at the inner header. I suspect someone
> else will follow up later on to add more fields to this, and we will
> end up having a new version of ip6tables... inside ip6t_inner6 :-).

Most probably it would be me who come to add more features, but i would call it 
sr6tables :-) 

> 
> nf_tables is a much more flexible framework, we can store the offset
> of this inner header in nft_pktinfo on demand, add new base to
> nft_payload and have access to all matching capabilities from any
> arbitrary offset. I really think this new feature belongs there.

Indeed, I started looking into the nftables implemenation and really convienced 
it's more convienent. Moreover, I had many issues with the ip6tables 
performance specially with the increae in the number of rules. However, why 
don't we have these patches in the kernel? since we have them implemented (some 
folks still like ip6tables). 

P.S. I'm looking into nftables exthdrs to support SRH.
Thanks, 
Ahmed 


Re: [PATCH net] ipv6: change route cache aging logic

2018-01-26 Thread Wei Wang
On Fri, Jan 26, 2018 at 12:05 PM, Martin KaFai Lau  wrote:
> On Fri, Jan 26, 2018 at 11:40:17AM -0800, Wei Wang wrote:
>> From: Wei Wang 
>>
>> In current route cache aging logic, if a route has both RTF_EXPIRE and
>> RTF_GATEWAY set, the route will only be removed if the neighbor cache
>> has no RTN_ROUTE flag. Otherwise, even if the route has expired, it
> You meant NTF_ROUTER instead of RTN_ROUTE?
>
Yes. NTF_ROUTER flag. Sorry...

>> won't get deleted.
>> Fix this logic to always check if the route has expired first and then
>> do the gateway neighbor cache check if previous check decide to not
>> remove the exception entry.
>>
>> Fixes: 1859bac04fb6 ("ipv6: remove from fib tree aged out RTF_CACHE dst")
>> Signed-off-by: Wei Wang 
>> Signed-off-by: Eric Dumazet 
> Nice catch!
>
> Acked-by: Martin KaFai Lau 
>
>> ---
>>  net/ipv6/route.c | 20 
>>  1 file changed, 12 insertions(+), 8 deletions(-)
>>
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 0458b761f3c5..a560fb1d0230 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -1586,12 +1586,19 @@ static void rt6_age_examine_exception(struct 
>> rt6_exception_bucket *bucket,
>>* EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
>>* expired, independently from their aging, as per RFC 8201 section 4
>>*/
>> - if (!(rt->rt6i_flags & RTF_EXPIRES) &&
>> - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
>> - RT6_TRACE("aging clone %p\n", rt);
>> + if (!(rt->rt6i_flags & RTF_EXPIRES)) {
>> + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
>> + RT6_TRACE("aging clone %p\n", rt);
>> + rt6_remove_exception(bucket, rt6_ex);
>> + return;
>> + }
>> + } else if (time_after(jiffies, rt->dst.expires)) {
>> + RT6_TRACE("purging expired route %p\n", rt);
>>   rt6_remove_exception(bucket, rt6_ex);
>>   return;
>> - } else if (rt->rt6i_flags & RTF_GATEWAY) {
>> + }
>> +
>> + if (rt->rt6i_flags & RTF_GATEWAY) {
>>   struct neighbour *neigh;
>>   __u8 neigh_flags = 0;
>>
>> @@ -1606,11 +1613,8 @@ static void rt6_age_examine_exception(struct 
>> rt6_exception_bucket *bucket,
>>   rt6_remove_exception(bucket, rt6_ex);
>>   return;
>>   }
>> - } else if (__rt6_check_expired(rt)) {
>> - RT6_TRACE("purging expired route %p\n", rt);
>> - rt6_remove_exception(bucket, rt6_ex);
>> - return;
>>   }
>> +
>>   gc_args->more++;
>>  }
>>
>> --
>> 2.16.0.rc1.238.g530d649a79-goog
>>


[PATCH bpf-next] bpf: clean up from test_tcpbpf_kern.c

2018-01-26 Thread Lawrence Brakmo
Removed commented lines from test_tcpbpf_kern.c

Fixes: d6d4f60c3a09 bpf: add selftest for tcpbpf
Signed-off-by: Lawrence Brakmo 
---
 tools/testing/selftests/bpf/test_tcpbpf_kern.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c 
b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
index 66bf715..57119ad 100644
--- a/tools/testing/selftests/bpf/test_tcpbpf_kern.c
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -79,9 +79,6 @@ int bpf_testcb(struct bpf_sock_ops *skops)
}
break;
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
-   /* Set callback */
-// good_call_rv = bpf_sock_ops_cb_flags_set(skops,
-//  BPF_SOCK_OPS_STATE_CB_FLAG);
skops->sk_txhash = 0x12345f;
v = 0xff;
rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, ,
-- 
2.9.5



Re: [PATCH net] ipv6: change route cache aging logic

2018-01-26 Thread Martin KaFai Lau
On Fri, Jan 26, 2018 at 11:40:17AM -0800, Wei Wang wrote:
> From: Wei Wang 
> 
> In current route cache aging logic, if a route has both RTF_EXPIRE and
> RTF_GATEWAY set, the route will only be removed if the neighbor cache
> has no RTN_ROUTE flag. Otherwise, even if the route has expired, it
You meant NTF_ROUTER instead of RTN_ROUTE?

> won't get deleted.
> Fix this logic to always check if the route has expired first and then
> do the gateway neighbor cache check if previous check decide to not
> remove the exception entry.
> 
> Fixes: 1859bac04fb6 ("ipv6: remove from fib tree aged out RTF_CACHE dst")
> Signed-off-by: Wei Wang 
> Signed-off-by: Eric Dumazet 
Nice catch!

Acked-by: Martin KaFai Lau 

> ---
>  net/ipv6/route.c | 20 
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 0458b761f3c5..a560fb1d0230 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1586,12 +1586,19 @@ static void rt6_age_examine_exception(struct 
> rt6_exception_bucket *bucket,
>* EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
>* expired, independently from their aging, as per RFC 8201 section 4
>*/
> - if (!(rt->rt6i_flags & RTF_EXPIRES) &&
> - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
> - RT6_TRACE("aging clone %p\n", rt);
> + if (!(rt->rt6i_flags & RTF_EXPIRES)) {
> + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
> + RT6_TRACE("aging clone %p\n", rt);
> + rt6_remove_exception(bucket, rt6_ex);
> + return;
> + }
> + } else if (time_after(jiffies, rt->dst.expires)) {
> + RT6_TRACE("purging expired route %p\n", rt);
>   rt6_remove_exception(bucket, rt6_ex);
>   return;
> - } else if (rt->rt6i_flags & RTF_GATEWAY) {
> + }
> +
> + if (rt->rt6i_flags & RTF_GATEWAY) {
>   struct neighbour *neigh;
>   __u8 neigh_flags = 0;
>  
> @@ -1606,11 +1613,8 @@ static void rt6_age_examine_exception(struct 
> rt6_exception_bucket *bucket,
>   rt6_remove_exception(bucket, rt6_ex);
>   return;
>   }
> - } else if (__rt6_check_expired(rt)) {
> - RT6_TRACE("purging expired route %p\n", rt);
> - rt6_remove_exception(bucket, rt6_ex);
> - return;
>   }
> +
>   gc_args->more++;
>  }
>  
> -- 
> 2.16.0.rc1.238.g530d649a79-goog
> 


Re: [PATCH net-next,v2 2/2] net: sched: add em_ipt ematch for calling xtables matches

2018-01-26 Thread Eyal Birger
On Fri, Jan 26, 2018 at 8:50 PM, Pablo Neira Ayuso  wrote:
> On Fri, Jan 26, 2018 at 06:48:53PM +0200, Eyal Birger wrote:
>> diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
>> new file mode 100644
>> index 000..2103b30
>> --- /dev/null
>> +++ b/net/sched/em_ipt.c
> [...]
>> +static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
>> + struct tcf_pkt_info *info)
>> +{
>> + const struct em_ipt_match *im = (const void *)em->data;
>> + struct xt_action_param acpar = {};
>> + struct net_device *indev = NULL;
>> + struct nf_hook_state state;
>> + int ret;
>> +
>> + if (unlikely(!skb_at_tc_ingress(skb))) {
>> + pr_notice_once("ipt match must not be used at egress\n");
>
> Isn't there a way to reject the use of this from ->change()? ie. from
> control plane configuration.

I wasn't able to find a simple way of doing so:

- AFAIU tc filters are detached from the qdiscs they operate on via
tcf_block instances
  that may be shared by different qdiscs. I was not able to be sure that filters
  attached to ingress qdiscs via tcf_blocks at configuration time
cannot be later be shared
  with non ingress qdiscs. Nor was I able to find another classifier
making the ingress/egress
  distinction at configuration time.

- ematches are not provided with 'ingress/egress' information at
'change()' invocation, though
  of course the infrastructure could be extended to provide this,
given the distinction is available.

Eyal.


[PATCH net] ipv6: change route cache aging logic

2018-01-26 Thread Wei Wang
From: Wei Wang 

In current route cache aging logic, if a route has both RTF_EXPIRE and
RTF_GATEWAY set, the route will only be removed if the neighbor cache
has no RTN_ROUTE flag. Otherwise, even if the route has expired, it
won't get deleted.
Fix this logic to always check if the route has expired first and then
do the gateway neighbor cache check if previous check decide to not
remove the exception entry.

Fixes: 1859bac04fb6 ("ipv6: remove from fib tree aged out RTF_CACHE dst")
Signed-off-by: Wei Wang 
Signed-off-by: Eric Dumazet 
---
 net/ipv6/route.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0458b761f3c5..a560fb1d0230 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1586,12 +1586,19 @@ static void rt6_age_examine_exception(struct 
rt6_exception_bucket *bucket,
 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
 * expired, independently from their aging, as per RFC 8201 section 4
 */
-   if (!(rt->rt6i_flags & RTF_EXPIRES) &&
-   time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
-   RT6_TRACE("aging clone %p\n", rt);
+   if (!(rt->rt6i_flags & RTF_EXPIRES)) {
+   if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+   RT6_TRACE("aging clone %p\n", rt);
+   rt6_remove_exception(bucket, rt6_ex);
+   return;
+   }
+   } else if (time_after(jiffies, rt->dst.expires)) {
+   RT6_TRACE("purging expired route %p\n", rt);
rt6_remove_exception(bucket, rt6_ex);
return;
-   } else if (rt->rt6i_flags & RTF_GATEWAY) {
+   }
+
+   if (rt->rt6i_flags & RTF_GATEWAY) {
struct neighbour *neigh;
__u8 neigh_flags = 0;
 
@@ -1606,11 +1613,8 @@ static void rt6_age_examine_exception(struct 
rt6_exception_bucket *bucket,
rt6_remove_exception(bucket, rt6_ex);
return;
}
-   } else if (__rt6_check_expired(rt)) {
-   RT6_TRACE("purging expired route %p\n", rt);
-   rt6_remove_exception(bucket, rt6_ex);
-   return;
}
+
gc_args->more++;
 }
 
-- 
2.16.0.rc1.238.g530d649a79-goog



Re: dvb usb issues since kernel 4.9

2018-01-26 Thread Mauro Carvalho Chehab
Em Fri, 26 Jan 2018 12:17:37 -0200
Mauro Carvalho Chehab  escreveu:

> Hi Alan,
> 
> Em Mon, 8 Jan 2018 14:15:35 -0500 (EST)
> Alan Stern  escreveu:
> 
> > On Mon, 8 Jan 2018, Linus Torvalds wrote:
> >   
> > > Can somebody tell which softirq it is that dvb/usb cares about?
> > 
> > I don't know about the DVB part.  The USB part is a little difficult to
> > analyze, mostly because the bug reports I've seen are mostly from
> > people running non-vanilla kernels.   
> 
> I suspect that the main reason for people not using vanilla Kernels
> is that, among other bugs, the dwc2 upstream driver has serious troubles
> handling ISOCH traffic.
> 
> Using Kernel 4.15-rc7 from this git tree:
>   https://git.linuxtv.org/mchehab/experimental.git/log/?h=softirq_fixup
> 
> (e. g. with the softirq bug partially reverted with Linux patch, and
>  the DWC2 deferred probe fixed)
> 
> With a PCTV 461e device, with uses em28xx driver + Montage frontend
> (with is the same used on dvbsky hardware - except for em28xx).
> 
> This device doesn't support bulk for DVB, just ISOCH. The drivers work 
> fine on x86.
> 
> Using a test signal at the bit rate of 56698,4 Kbits/s, that's what
> happens, when capturing less than one second of data:
> 
> $ dvbv5-zap -c ~/dvb_channel.conf "tv brasil" -l universal -X 100 -m -t2
> Using LNBf UNIVERSAL
>   Universal, Europe
>   Freqs : 10800 to 11800 MHz, LO: 9750 MHz
>   Freqs : 11600 to 12700 MHz, LO: 10600 MHz
> using demux 'dvb0.demux0'
> reading channels from file '/home/mchehab/dvb_channel.conf'
> tuning to 11468000 Hz
>(0x00) Signal= -33.90dBm
> Lock   (0x1f) Signal= -33.90dBm C/N= 30.28dB postBER= 2.33x10^-6
> dvb_dev_set_bufsize: buffer set to 6160384
>   dvb_set_pesfilter to 0x2000
> 354.08s: Starting capture
> 354.73s: only read 59220 bytes
> 354.73s: Stopping capture
> 
> [  354.000827] dwc2 3f98.usb: DWC OTG HCD EP DISABLE: 
> bEndpointAddress=0x84, ep->hcpriv=116f41b2
> [  354.000859] dwc2 3f98.usb: DWC OTG HCD EP RESET: bEndpointAddress=0x84
> [  354.010744] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun--
> ... (hundreds of thousands of Frame Overrun messages)
> [  354.660857] dwc2 3f98.usb: --Host Channel 5 Interrupt: Frame Overrun--
> [  354.660935] dwc2 3f98.usb: DWC OTG HCD URB Dequeue
> [  354.660959] dwc2 3f98.usb: Called usb_hcd_giveback_urb()
> [  354.660966] dwc2 3f98.usb:   urb->status = 0
> [  354.660992] dwc2 3f98.usb: DWC OTG HCD URB Dequeue
> [  354.661001] dwc2 3f98.usb: Called usb_hcd_giveback_urb()
> [  354.661008] dwc2 3f98.usb:   urb->status = 0
> [  354.661054] dwc2 3f98.usb: DWC OTG HCD URB Dequeue
> [  354.661065] dwc2 3f98.usb: Called usb_hcd_giveback_urb()
> [  354.661072] dwc2 3f98.usb:   urb->status = 0
> [  354.661107] dwc2 3f98.usb: DWC OTG HCD URB Dequeue
> [  354.661120] dwc2 3f98.usb: Called usb_hcd_giveback_urb()
> [  354.661127] dwc2 3f98.usb:   urb->status = 0
> [  354.661146] dwc2 3f98.usb: DWC OTG HCD URB Dequeue
> [  354.661158] dwc2 3f98.usb: Called usb_hcd_giveback_urb()
> [  354.661165] dwc2 3f98.usb:   urb->status = 0

Btw, 

Just in case, I also applied all recent pending dwc2 patches I found at
linux-usb (even trivial unrelated ones) at:

https://git.linuxtv.org/mchehab/experimental.git/log/?h=dwc2_patches

No differences. ISOCH is still broken.

If anyone wants to see the full logs, it is there:
https://pastebin.com/XJYyTwPv


Cheers,
Mauro


[PATCH iproute2] ip: address: fix stats64 JSON object name

2018-01-26 Thread Jakub Kicinski
The JSON object name for statistics in ip link show is "stats644".
Looks like a typo, commit d0e720111aad ("ip: ipaddress.c: add support
for json output") contains an example with the expected "stats64" name.

The fact that no one has noticed until now is probably an indication
that no one is using this object.  Hopefully it's not too late to fix
this, although IIUC this has already been in 4.13 and 4.14 releases :S

Fixes: d0e720111aad ("ip: ipaddress.c: add support for json output")
Signed-off-by: Jakub Kicinski 
---
 ip/ipaddress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index ba60125c1b78..67ac6bd31373 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -598,7 +598,7 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
   const struct rtattr *carrier_changes)
 {
if (is_json_context()) {
-   open_json_object("stats644");
+   open_json_object("stats64");
 
/* RX stats */
open_json_object("rx");
-- 
2.15.1



[PATCH iproute2-next 0/2] tc: qdisc: provide JSON output for RED and prio

2018-01-26 Thread Jakub Kicinski
Hi!

This small series adds support for JSON output for prio and RED qdiscs.

Jakub Kicinski (2):
  tc: red: JSON-ify RED output
  tc: prio: JSON-ify prio output

 include/json_print.h |  1 +
 lib/json_print.c |  1 +
 lib/json_writer.c|  4 
 tc/q_prio.c  | 12 
 tc/q_red.c   | 41 ++---
 5 files changed, 36 insertions(+), 23 deletions(-)

-- 
2.15.1



[PATCH iproute2-next 1/2] tc: red: JSON-ify RED output

2018-01-26 Thread Jakub Kicinski
Make JSON output work with RED Qdiscs.  Float/double printing
helpers have to be added/uncommented to print the probability.
Since TC stats in general are not split out to a separate object
the xstats printed by this patch are not separated either.

Signed-off-by: Jakub Kicinski 
---
 include/json_print.h |  1 +
 lib/json_print.c |  1 +
 lib/json_writer.c|  4 
 tc/q_red.c   | 41 ++---
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/include/json_print.h b/include/json_print.h
index dc4d2bb3ba59..2ca7830adbd6 100644
--- a/include/json_print.h
+++ b/include/json_print.h
@@ -64,6 +64,7 @@ _PRINT_FUNC(hu, unsigned short);
 _PRINT_FUNC(hex, unsigned int);
 _PRINT_FUNC(0xhex, unsigned int);
 _PRINT_FUNC(lluint, unsigned long long int);
+_PRINT_FUNC(float, double);
 #undef _PRINT_FUNC
 
 #endif /* _JSON_PRINT_H_ */
diff --git a/lib/json_print.c b/lib/json_print.c
index aa527af652c5..6518ba98f5bf 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -120,6 +120,7 @@ _PRINT_FUNC(int, int);
 _PRINT_FUNC(hu, unsigned short);
 _PRINT_FUNC(uint, uint64_t);
 _PRINT_FUNC(lluint, unsigned long long int);
+_PRINT_FUNC(float, double);
 #undef _PRINT_FUNC
 
 void print_color_string(enum output_type type,
diff --git a/lib/json_writer.c b/lib/json_writer.c
index 6b77d288cce2..f3eeaf7bc479 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -209,12 +209,10 @@ void jsonw_float_fmt(json_writer_t *self, const char 
*fmt, double num)
jsonw_printf(self, fmt, num);
 }
 
-#ifdef notused
 void jsonw_float(json_writer_t *self, double num)
 {
jsonw_printf(self, "%g", num);
 }
-#endif
 
 void jsonw_hu(json_writer_t *self, unsigned short num)
 {
@@ -249,13 +247,11 @@ void jsonw_bool_field(json_writer_t *self, const char 
*prop, bool val)
jsonw_bool(self, val);
 }
 
-#ifdef notused
 void jsonw_float_field(json_writer_t *self, const char *prop, double val)
 {
jsonw_name(self, prop);
jsonw_float(self, val);
 }
-#endif
 
 void jsonw_float_field_fmt(json_writer_t *self,
   const char *prop,
diff --git a/tc/q_red.c b/tc/q_red.c
index 1949558f14f9..40ba7c3e07c1 100644
--- a/tc/q_red.c
+++ b/tc/q_red.c
@@ -183,23 +183,34 @@ static int red_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
RTA_PAYLOAD(tb[TCA_RED_MAX_P]) >= sizeof(__u32))
max_P = rta_getattr_u32(tb[TCA_RED_MAX_P]);
 
-   fprintf(f, "limit %s min %s max %s ",
-   sprint_size(qopt->limit, b1),
-   sprint_size(qopt->qth_min, b2),
-   sprint_size(qopt->qth_max, b3));
+   print_uint(PRINT_JSON, "limit", NULL, qopt->limit);
+   print_string(PRINT_FP, NULL, "limit %s ", sprint_size(qopt->limit, b1));
+   print_uint(PRINT_JSON, "min", NULL, qopt->qth_min);
+   print_string(PRINT_FP, NULL, "min %s ", sprint_size(qopt->qth_min, b2));
+   print_uint(PRINT_JSON, "max", NULL, qopt->qth_max);
+   print_string(PRINT_FP, NULL, "max %s ", sprint_size(qopt->qth_max, b3));
+
if (qopt->flags & TC_RED_ECN)
-   fprintf(f, "ecn ");
+   print_bool(PRINT_ANY, "ecn", "ecn ", true);
+   else
+   print_bool(PRINT_ANY, "ecn", NULL, false);
if (qopt->flags & TC_RED_HARDDROP)
-   fprintf(f, "harddrop ");
+   print_bool(PRINT_ANY, "harddrop", "harddrop ", true);
+   else
+   print_bool(PRINT_ANY, "harddrop", NULL, false);
if (qopt->flags & TC_RED_ADAPTATIVE)
-   fprintf(f, "adaptive ");
+   print_bool(PRINT_ANY, "adaptive", "adaptive ", true);
+   else
+   print_bool(PRINT_ANY, "adaptive", NULL, false);
if (show_details) {
-   fprintf(f, "ewma %u ", qopt->Wlog);
+   print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog);
if (max_P)
-   fprintf(f, "probability %lg ", max_P / pow(2, 32));
+   print_float(PRINT_ANY, "probability",
+   "probability %lg ", max_P / pow(2, 32));
else
-   fprintf(f, "Plog %u ", qopt->Plog);
-   fprintf(f, "Scell_log %u", qopt->Scell_log);
+   print_uint(PRINT_ANY, "Plog", "Plog %u ", qopt->Plog);
+   print_uint(PRINT_ANY, "Scell_log", "Scell_log %u",
+  qopt->Scell_log);
}
return 0;
 }
@@ -216,10 +227,10 @@ static int red_print_xstats(struct qdisc_util *qu, FILE 
*f, struct rtattr *xstat
return -1;
 
st = RTA_DATA(xstats);
-   fprintf(f, "  marked %u early %u pdrop %u other %u",
-   st->marked, st->early, st->pdrop, st->other);
-   return 0;
-
+   print_uint(PRINT_ANY, "marked", "  marked %u ", st->marked);
+   print_uint(PRINT_ANY, "early", "early %u ", st->early);
+   

[PATCH iproute2-next 2/2] tc: prio: JSON-ify prio output

2018-01-26 Thread Jakub Kicinski
Make JSON output work with prio Qdiscs.  This will also make
other qdiscs which reuse the print_qopt work, like mqprio or
pfifo_fast.

Note that there is a double space between "priomap" and first
prio number.  Keep this original behaviour.

Signed-off-by: Jakub Kicinski 
---
 tc/q_prio.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tc/q_prio.c b/tc/q_prio.c
index 358cf06cca0a..8ef7cfa260d5 100644
--- a/tc/q_prio.c
+++ b/tc/q_prio.c
@@ -107,13 +107,17 @@ int prio_print_opt(struct qdisc_util *qu, FILE *f, struct 
rtattr *opt)
sizeof(*qopt)))
return -1;
 
-   fprintf(f, "bands %u priomap ", qopt->bands);
+   print_uint(PRINT_ANY, "bands", "bands %u ", qopt->bands);
+   open_json_array(PRINT_ANY, "priomap ");
for (i = 0; i <= TC_PRIO_MAX; i++)
-   fprintf(f, " %d", qopt->priomap[i]);
+   print_uint(PRINT_ANY, NULL, " %d", qopt->priomap[i]);
+   close_json_array(PRINT_ANY, "");
 
if (tb[TCA_PRIO_MQ])
-   fprintf(f, " multiqueue: %s ",
-   rta_getattr_u8(tb[TCA_PRIO_MQ]) ? "on" : "off");
+   print_string(PRINT_FP, NULL, " multiqueue: %s ",
+rta_getattr_u8(tb[TCA_PRIO_MQ]) ? "on" : "off");
+   print_bool(PRINT_JSON, "multiqueue", NULL,
+  tb[TCA_PRIO_MQ] && rta_getattr_u8(tb[TCA_PRIO_MQ]));
 
return 0;
 }
-- 
2.15.1



Re: [regresssion 4.15] Userspace compilation broken by uapi/linux/if_ether.h update

2018-01-26 Thread David Miller
From: Guillaume Nault 
Date: Fri, 26 Jan 2018 18:17:23 +0100

> On Fri, Jan 26, 2018 at 11:51:38AM +0100, Guillaume Nault wrote:
>> On Thu, Jan 25, 2018 at 11:21:34PM +0100, Hauke Mehrtens wrote:
>> > On 01/25/2018 03:58 PM, Guillaume Nault wrote:
>> > > Now that linux/libc-compat.h is included in linux/if_ether.h, it is
>> > > processed before netinet/in.h. Therefore, it sets the relevant
>> > > __UAPI_DEF_* guards to 1 (as _NETINET_IN_H isn't yet defined).
>> > > Then netinet/in.h is included, followed by linux/in.h. The later
>> > > doesn't realise that what it defines has already been included by
>> > > netinet/in.h because the __UAPI_DEF_* guards were set too early.
>> > > 
>> > This is about this commit:
>> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6926e041a8920c8ec27e4e155efa760aa01551fd
>> > 
>> > On option would be to move this into include/uapi/linux/if_ether.h and
>> > remove the include for libc-compat.h:
>> > #ifndef __UAPI_DEF_ETHHDR
>> > #define __UAPI_DEF_ETHHDR  1
>> > #endif
>> > 
>> > This will only work if netinet/if_ether.h is included before
>> > linux/if_ether.h, but I think this is very likely.
>> >
>> I don't see what makes its likely. That's not directly related to your
>> point, but for example, glibc guarantees the opposite as it includes
>> linux/if_ether.h at the beginning of netinet/if_ether.h.
>> 
>> > I think we can do this because we do not need some special libc handling
>> > like it is done for other symbols as __UAPI_DEF_ETHHDR is currently only
>> > needed by musl and not by glibc.
>> > 
>> That's ok for me as long as existing projects keep compiling. But all
>> __UAPI_DEF_* are currently centralised in libc-compat.h. Adding
>> __UAPI_DEF_ETHHDR in if_ether.h looks like defeating the purpose of
>> libc-compat.h and I wonder if that'd be accepted. Maybe with a
>> different name.
>> 
>> In any case, we're really late in the release cycle. If more discussion
>> is needed, it's probably better to revert and take time to work on a
>> solution for the next release.
>> 
> Hi David,
> 
> I just realise you've sent your last pull request for this release. I
> was waiting for feedbacks in order to avoid a revert. Should I send a
> revert now or do you prefer to sort this out later and backport a fix
> in 4.15.1?

We can do a -stable backport, and I was planning to help looking into this
as well.


Re: [PATCH net-next] bnxt_en: cleanup DIM work on device shutdown

2018-01-26 Thread Michael Chan
On Fri, Jan 26, 2018 at 7:27 AM, Andy Gospodarek  wrote:
>
> From: Andy Gospodarek 
>
> Make sure to cancel any pending work that might update driver coalesce
> settings when taking down an interface.
>
> Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt 
> moderation")
> Signed-off-by: Andy Gospodarek 
> Cc: Michael Chan 

Acked-by: Michael Chan 


Re: [PATCH net-next,v2 2/2] net: sched: add em_ipt ematch for calling xtables matches

2018-01-26 Thread Pablo Neira Ayuso
On Fri, Jan 26, 2018 at 06:48:53PM +0200, Eyal Birger wrote:
> diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
> new file mode 100644
> index 000..2103b30
> --- /dev/null
> +++ b/net/sched/em_ipt.c
[...]
> +static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
> + struct tcf_pkt_info *info)
> +{
> + const struct em_ipt_match *im = (const void *)em->data;
> + struct xt_action_param acpar = {};
> + struct net_device *indev = NULL;
> + struct nf_hook_state state;
> + int ret;
> +
> + if (unlikely(!skb_at_tc_ingress(skb))) {
> + pr_notice_once("ipt match must not be used at egress\n");

Isn't there a way to reject the use of this from ->change()? ie. from
control plane configuration.

> + return 0;
> + }


[net-next 00/15][pull request] 10GbE Intel Wired LAN Driver Updates 2018-01-26

2018-01-26 Thread Jeff Kirsher
This series contains updates to ixgbe and ixgbevf.

Emil updates ixgbevf to match ixgbe functionality, starting with the
consolidating of functions that represent logical steps in the receive
process so we can later update them more easily.  Updated ixgbevf to
only synchronize the length of the frame, which will typically be the
MTU or smaller.  Updated the VF driver to use the length of the packet
instead of the DD status bit to determine if a new descriptor is ready
to be processed, which saves on reads and we can save time on
initialization.  Added support for DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING
to help improve performance on some platforms.  Updated the VF driver to
do bulk updates of the page reference count instead of just incrementing
it by one reference at a time.  Updated the VF driver to only go through
the region of the receive ring that was designated to be cleaned up,
rather than process the entire ring.

Colin Ian King adds the use of ARRAY_SIZE() on various arrays.

Miroslav Lichvar fixes an issue where ethtool was reporting timestamping
filters unsupported for X550, which is incorrect.

Paul adds support for reporting 5G link speed for some devices.

Dan Carpenter fixes a typo where && was used when it should have been
||.

The following are changes since commit 9515a2e082f91457db0ecff4b65371d0fb5d9aad:
  net/ipv4: Allow send to local broadcast from a socket bound to a VRF
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE

Colin Ian King (2):
  ixgbevf: use ARRAY_SIZE for various array sizing calculations
  ixgbe: use ARRAY_SIZE for array sizing calculation on array buf

Dan Carpenter (1):
  ixgbe: Fix && vs || typo

Emil Tantilov (10):
  ixgbevf: add function for checking if we can reuse page
  ixgbevf: only DMA sync frame length
  ixgbevf: use length to determine if descriptor is done
  ixgbevf: add support for DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING
  ixgbevf: update code to better handle incrementing page count
  ixgbevf: add counters for Rx page allocations
  ixgbevf: clear rx_buffer_info in configure instead of clean
  ixgbevf: improve performance and reduce size of ixgbevf_tx_map()
  ixgbevf: don't bother clearing tx_buffer_info in
ixgbevf_clean_tx_ring()
  ixgbe: don't set RXDCTL.RLPML for 82599

Miroslav Lichvar (1):
  ixgbe: Don't report unsupported timestamping filters for X550

Paul Greenwalt (1):
  ixgbe: add support for reporting 5G link speed

 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c   |   2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |  37 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  11 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c |   2 +-
 drivers/net/ethernet/intel/ixgbevf/ethtool.c  |   3 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  16 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 362 +-
 drivers/net/ethernet/intel/ixgbevf/vf.c   |  17 +-
 8 files changed, 271 insertions(+), 179 deletions(-)

-- 
2.14.3



[net-next 02/15] ixgbevf: only DMA sync frame length

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on commit 64f2525ca4e7 ("igb: Only DMA sync frame length")

On some architectures synching a buffer for DMA may be expensive.
Instead of the entire 2K receive buffer only synchronize the length of
the frame, which will typically be the MTU or smaller.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 7ffd429d8e40..0cc2688671ec 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -803,12 +803,12 @@ static bool ixgbevf_can_reuse_rx_page(struct 
ixgbevf_rx_buffer *rx_buffer,
  **/
 static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring,
struct ixgbevf_rx_buffer *rx_buffer,
+   u16 size,
union ixgbe_adv_rx_desc *rx_desc,
struct sk_buff *skb)
 {
struct page *page = rx_buffer->page;
unsigned char *va = page_address(page) + rx_buffer->page_offset;
-   unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 #if (PAGE_SIZE < 8192)
unsigned int truesize = IXGBEVF_RX_BUFSZ;
 #else
@@ -856,6 +856,7 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct 
ixgbevf_ring *rx_ring,
 {
struct ixgbevf_rx_buffer *rx_buffer;
struct page *page;
+   u16 size = le16_to_cpu(rx_desc->wb.upper.length);
 
rx_buffer = _ring->rx_buffer_info[rx_ring->next_to_clean];
page = rx_buffer->page;
@@ -890,11 +891,11 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct 
ixgbevf_ring *rx_ring,
dma_sync_single_range_for_cpu(rx_ring->dev,
  rx_buffer->dma,
  rx_buffer->page_offset,
- IXGBEVF_RX_BUFSZ,
+ size,
  DMA_FROM_DEVICE);
 
/* pull page into skb */
-   if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+   if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
/* hand second half of page back to the ring */
ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
} else {
-- 
2.14.3



[net-next 01/15] ixgbevf: add function for checking if we can reuse page

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Introduce ixgbevf_can_reuse_page() similar to the change in ixgbe from
commit af43da0dba0b
("ixgbe: Add function for checking to see if we can reuse page")

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 59 +--
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index ed5c3aea7939..7ffd429d8e40 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -754,6 +754,38 @@ static inline bool ixgbevf_page_is_reserved(struct page 
*page)
return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
+static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer,
+ struct page *page,
+ const unsigned int truesize)
+{
+   /* avoid re-using remote pages */
+   if (unlikely(ixgbevf_page_is_reserved(page)))
+   return false;
+
+#if (PAGE_SIZE < 8192)
+   /* if we are only owner of page we can reuse it */
+   if (unlikely(page_count(page) != 1))
+   return false;
+
+   /* flip page offset to other buffer */
+   rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
+
+#else
+   /* move offset up to the next cache line */
+   rx_buffer->page_offset += truesize;
+
+   if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
+   return false;
+
+#endif
+   /* Even if we own the page, we are not allowed to use atomic_set()
+* This would break get_page_unless_zero() users.
+*/
+   page_ref_inc(page);
+
+   return true;
+}
+
 /**
  * ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff
  * @rx_ring: rx descriptor ring to transact packets on
@@ -815,32 +847,7 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring 
*rx_ring,
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
(unsigned long)va & ~PAGE_MASK, size, truesize);
 
-   /* avoid re-using remote pages */
-   if (unlikely(ixgbevf_page_is_reserved(page)))
-   return false;
-
-#if (PAGE_SIZE < 8192)
-   /* if we are only owner of page we can reuse it */
-   if (unlikely(page_count(page) != 1))
-   return false;
-
-   /* flip page offset to other buffer */
-   rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
-
-#else
-   /* move offset up to the next cache line */
-   rx_buffer->page_offset += truesize;
-
-   if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
-   return false;
-
-#endif
-   /* Even if we own the page, we are not allowed to use atomic_set()
-* This would break get_page_unless_zero() users.
-*/
-   page_ref_inc(page);
-
-   return true;
+   return ixgbevf_can_reuse_rx_page(rx_buffer, page, truesize);
 }
 
 static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring,
-- 
2.14.3



[net-next 03/15] ixgbevf: use length to determine if descriptor is done

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on:
commit 7ec0116c9131 ("igb: Use length to determine if descriptor is done")

This change makes it so that we use the length of the packet instead of the
DD status bit to determine if a new descriptor is ready to be processed.
The obvious advantage is that it cuts down on reads as we don't really even
need the DD bit if going from a 0 to a non-zero value on size is enough to
inform us that the packet has been completed.

In addition we only reset the Rx descriptor length for descriptor zero when
resetting a ring instead of having to do a memset with 0 over the entire
ring. By doing this we can save some time on initialization.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 0cc2688671ec..725fe2dca868 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -653,8 +653,8 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring 
*rx_ring,
i -= rx_ring->count;
}
 
-   /* clear the hdr_addr for the next_to_use descriptor */
-   rx_desc->read.hdr_addr = 0;
+   /* clear the length for the next_to_use descriptor */
+   rx_desc->wb.upper.length = 0;
 
cleaned_count--;
} while (cleaned_count);
@@ -938,7 +938,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector 
*q_vector,
 
rx_desc = IXGBEVF_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-   if (!ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_DD))
+   if (!rx_desc->wb.upper.length)
break;
 
/* This memory barrier is needed to keep us from reading
@@ -1729,6 +1729,7 @@ static void ixgbevf_configure_rx_ring(struct 
ixgbevf_adapter *adapter,
  struct ixgbevf_ring *ring)
 {
struct ixgbe_hw *hw = >hw;
+   union ixgbe_adv_rx_desc *rx_desc;
u64 rdba = ring->dma;
u32 rxdctl;
u8 reg_idx = ring->reg_idx;
@@ -1757,6 +1758,10 @@ static void ixgbevf_configure_rx_ring(struct 
ixgbevf_adapter *adapter,
IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0);
ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx);
 
+   /* initialize Rx descriptor 0 */
+   rx_desc = IXGBEVF_RX_DESC(ring, 0);
+   rx_desc->wb.upper.length = 0;
+
/* reset ntu and ntc to place SW in sync with hardwdare */
ring->next_to_clean = 0;
ring->next_to_use = 0;
@@ -2141,9 +2146,6 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring 
*rx_ring)
 
size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count;
memset(rx_ring->rx_buffer_info, 0, size);
-
-   /* Zero out the descriptor ring */
-   memset(rx_ring->desc, 0, rx_ring->size);
 }
 
 /**
-- 
2.14.3



[net-next 06/15] ixgbevf: add counters for Rx page allocations

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

We already had placeholders for failed page and buffer allocations.
Added alloc_rx_page and made sure the stats are properly updated and
exposed in ethtool.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c  |  3 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  6 --
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 23 ++-
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c 
b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index ff9d05f308ee..4400e49090b4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -75,6 +75,9 @@ static struct ixgbe_stats ixgbevf_gstrings_stats[] = {
IXGBEVF_STAT("tx_timeout_count", tx_timeout_count),
IXGBEVF_NETDEV_STAT(multicast),
IXGBEVF_STAT("rx_csum_offload_errors", hw_csum_rx_error),
+   IXGBEVF_STAT("alloc_rx_page", alloc_rx_page),
+   IXGBEVF_STAT("alloc_rx_page_failed", alloc_rx_page_failed),
+   IXGBEVF_STAT("alloc_rx_buff_failed", alloc_rx_buff_failed),
 };
 
 #define IXGBEVF_QUEUE_STATS_LEN ( \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index c70a789035ae..f6952425c87d 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -84,6 +84,7 @@ struct ixgbevf_tx_queue_stats {
 struct ixgbevf_rx_queue_stats {
u64 alloc_rx_page_failed;
u64 alloc_rx_buff_failed;
+   u64 alloc_rx_page;
u64 csum_err;
 };
 
@@ -295,8 +296,9 @@ struct ixgbevf_adapter {
u64 hw_csum_rx_error;
u64 hw_rx_no_dma_resources;
int num_msix_vectors;
-   u32 alloc_rx_page_failed;
-   u32 alloc_rx_buff_failed;
+   u64 alloc_rx_page_failed;
+   u64 alloc_rx_buff_failed;
+   u64 alloc_rx_page;
 
struct msix_entry *msix_entries;
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index ae2402ddd9fb..350afec3dde8 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -604,7 +604,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring 
*rx_ring,
if (dma_mapping_error(rx_ring->dev, dma)) {
__free_page(page);
 
-   rx_ring->rx_stats.alloc_rx_buff_failed++;
+   rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
 
@@ -612,6 +612,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring 
*rx_ring,
bi->page = page;
bi->page_offset = 0;
bi->pagecnt_bias = 1;
+   rx_ring->rx_stats.alloc_rx_page++;
 
return true;
 }
@@ -963,8 +964,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector 
*q_vector,
skb = ixgbevf_fetch_rx_buffer(rx_ring, rx_desc, skb);
 
/* exit if we failed to retrieve a buffer */
-   if (!skb)
+   if (!skb) {
+   rx_ring->rx_stats.alloc_rx_buff_failed++;
break;
+   }
 
cleaned_count++;
 
@@ -2749,6 +2752,8 @@ static int ixgbevf_sw_init(struct ixgbevf_adapter 
*adapter)
 void ixgbevf_update_stats(struct ixgbevf_adapter *adapter)
 {
	struct ixgbe_hw *hw = &adapter->hw;
+   u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0;
+   u64 alloc_rx_page = 0, hw_csum_rx_error = 0;
int i;
 
	if (test_bit(__IXGBEVF_DOWN, &adapter->state) ||
@@ -2769,10 +2774,18 @@ void ixgbevf_update_stats(struct ixgbevf_adapter 
*adapter)
adapter->stats.vfmprc);
 
for (i = 0;  i  < adapter->num_rx_queues;  i++) {
-   adapter->hw_csum_rx_error +=
-   adapter->rx_ring[i]->hw_csum_rx_error;
-   adapter->rx_ring[i]->hw_csum_rx_error = 0;
+   struct ixgbevf_ring *rx_ring = adapter->rx_ring[i];
+
+   hw_csum_rx_error += rx_ring->rx_stats.csum_err;
+   alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed;
+   alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed;
+   alloc_rx_page += rx_ring->rx_stats.alloc_rx_page;
}
+
+   adapter->hw_csum_rx_error = hw_csum_rx_error;
+   adapter->alloc_rx_page_failed = alloc_rx_page_failed;
+   adapter->alloc_rx_buff_failed = alloc_rx_buff_failed;
+   adapter->alloc_rx_page = alloc_rx_page;
 }
 
 /**
-- 
2.14.3



[net-next 09/15] ixgbevf: don't bother clearing tx_buffer_info in ixgbevf_clean_tx_ring()

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

In the case of the Tx rings we need to only clear the Tx buffer_info when
we are resetting the rings.  Ideally we do this when we configure the ring
to bring it back up instead of when we are taking it down in order to avoid
dirtying pages we don't need to.

In addition we don't need to clear the Tx descriptor ring since we will
fully repopulate it when we begin transmitting frames and next_to_watch can
be cleared to prevent the ring from being cleaned beyond that point instead
of needing to touch anything in the Tx descriptor ring.

Finally with these changes we can avoid having to reset the skb member of
the Tx buffer_info structure in the cleanup path since the skb will always
be associated with the first buffer which has next_to_watch set.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 115 ++
 1 file changed, 72 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index d3415ee38597..9b3d43d28106 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -206,28 +206,6 @@ static void ixgbevf_set_ivar(struct ixgbevf_adapter 
*adapter, s8 direction,
}
 }
 
-static void ixgbevf_unmap_and_free_tx_resource(struct ixgbevf_ring *tx_ring,
-   struct ixgbevf_tx_buffer *tx_buffer)
-{
-   if (tx_buffer->skb) {
-   dev_kfree_skb_any(tx_buffer->skb);
-   if (dma_unmap_len(tx_buffer, len))
-   dma_unmap_single(tx_ring->dev,
-dma_unmap_addr(tx_buffer, dma),
-dma_unmap_len(tx_buffer, len),
-DMA_TO_DEVICE);
-   } else if (dma_unmap_len(tx_buffer, len)) {
-   dma_unmap_page(tx_ring->dev,
-  dma_unmap_addr(tx_buffer, dma),
-  dma_unmap_len(tx_buffer, len),
-  DMA_TO_DEVICE);
-   }
-   tx_buffer->next_to_watch = NULL;
-   tx_buffer->skb = NULL;
-   dma_unmap_len_set(tx_buffer, len, 0);
-   /* tx_buffer must be completely set up in the transmit path */
-}
-
 static u64 ixgbevf_get_tx_completed(struct ixgbevf_ring *ring)
 {
return ring->stats.packets;
@@ -349,7 +327,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 DMA_TO_DEVICE);
 
/* clear tx_buffer data */
-   tx_buffer->skb = NULL;
dma_unmap_len_set(tx_buffer, len, 0);
 
/* unmap remaining buffers */
@@ -1576,6 +1553,10 @@ static void ixgbevf_configure_tx_ring(struct 
ixgbevf_adapter *adapter,
txdctl |= (1u << 8) |/* HTHRESH = 1 */
   32;   /* PTHRESH = 32 */
 
+   /* reinitialize tx_buffer_info */
+   memset(ring->tx_buffer_info, 0,
+  sizeof(struct ixgbevf_tx_buffer) * ring->count);
+
	clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state);
 
IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), txdctl);
@@ -2184,23 +2165,57 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring 
*rx_ring)
  **/
 static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring)
 {
-   struct ixgbevf_tx_buffer *tx_buffer_info;
-   unsigned long size;
-   unsigned int i;
+   u16 i = tx_ring->next_to_clean;
+   struct ixgbevf_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
 
-   if (!tx_ring->tx_buffer_info)
-   return;
+   while (i != tx_ring->next_to_use) {
+   union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
+
+   /* Free all the Tx ring sk_buffs */
+   dev_kfree_skb_any(tx_buffer->skb);
+
+   /* unmap skb header data */
+   dma_unmap_single(tx_ring->dev,
+dma_unmap_addr(tx_buffer, dma),
+dma_unmap_len(tx_buffer, len),
+DMA_TO_DEVICE);
+
+   /* check for eop_desc to determine the end of the packet */
+   eop_desc = tx_buffer->next_to_watch;
+   tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
+
+   /* unmap remaining buffers */
+   while (tx_desc != eop_desc) {
+   tx_buffer++;
+   tx_desc++;
+   i++;
+   if (unlikely(i == tx_ring->count)) {
+   i = 0;
+   tx_buffer = tx_ring->tx_buffer_info;
+   tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
+   }
+
+

[net-next 11/15] ixgbe: use ARRAY_SIZE for array sizing calculation on array buf

2018-01-26 Thread Jeff Kirsher
From: Colin Ian King 

Use the ARRAY_SIZE macro on array buf to determine size of the array.
Improvement suggested by coccinelle.

Signed-off-by: Colin Ian King 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 3bce26e77090..f470d0204771 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -949,7 +949,7 @@ static s32 ixgbe_checksum_ptr_x550(struct ixgbe_hw *hw, u16 
ptr,
u16 length, bufsz, i, start;
u16 *local_buffer;
 
-   bufsz = sizeof(buf) / sizeof(buf[0]);
+   bufsz = ARRAY_SIZE(buf);
 
/* Read a chunk at the pointer location */
if (!buffer) {
-- 
2.14.3



[net-next 12/15] ixgbe: Don't report unsupported timestamping filters for X550

2018-01-26 Thread Jeff Kirsher
From: Miroslav Lichvar 

The current code enables on X550 timestamping of all packets for any
filter, which means ethtool should not report any PTP-specific filters
as unsupported.

Signed-off-by: Miroslav Lichvar 
Acked-by: Richard Cochran 
Acked-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 37 
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 317351025fd7..221f15803480 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3085,26 +3085,9 @@ static int ixgbe_get_ts_info(struct net_device *dev,
case ixgbe_mac_X550EM_x:
case ixgbe_mac_x550em_a:
info->rx_filters |= BIT(HWTSTAMP_FILTER_ALL);
-   /* fallthrough */
+   break;
case ixgbe_mac_X540:
case ixgbe_mac_82599EB:
-   info->so_timestamping =
-   SOF_TIMESTAMPING_TX_SOFTWARE |
-   SOF_TIMESTAMPING_RX_SOFTWARE |
-   SOF_TIMESTAMPING_SOFTWARE |
-   SOF_TIMESTAMPING_TX_HARDWARE |
-   SOF_TIMESTAMPING_RX_HARDWARE |
-   SOF_TIMESTAMPING_RAW_HARDWARE;
-
-   if (adapter->ptp_clock)
-   info->phc_index = ptp_clock_index(adapter->ptp_clock);
-   else
-   info->phc_index = -1;
-
-   info->tx_types =
-   BIT(HWTSTAMP_TX_OFF) |
-   BIT(HWTSTAMP_TX_ON);
-
info->rx_filters |=
BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
@@ -3113,6 +3096,24 @@ static int ixgbe_get_ts_info(struct net_device *dev,
default:
return ethtool_op_get_ts_info(dev, info);
}
+
+   info->so_timestamping =
+   SOF_TIMESTAMPING_TX_SOFTWARE |
+   SOF_TIMESTAMPING_RX_SOFTWARE |
+   SOF_TIMESTAMPING_SOFTWARE |
+   SOF_TIMESTAMPING_TX_HARDWARE |
+   SOF_TIMESTAMPING_RX_HARDWARE |
+   SOF_TIMESTAMPING_RAW_HARDWARE;
+
+   if (adapter->ptp_clock)
+   info->phc_index = ptp_clock_index(adapter->ptp_clock);
+   else
+   info->phc_index = -1;
+
+   info->tx_types =
+   BIT(HWTSTAMP_TX_OFF) |
+   BIT(HWTSTAMP_TX_ON);
+
return 0;
 }
 
-- 
2.14.3



[net-next 04/15] ixgbevf: add support for DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on commit 5be5955425c2
("igb: update driver to make use of DMA_ATTR_SKIP_CPU_SYNC")
and
commit 7bd175928280 ("igb: Add support for DMA_ATTR_WEAK_ORDERING")

Convert the calls to dma_map/unmap_page() to the attributes version
and add DMA_ATTR_SKIP_CPU_SYNC/WEAK_ORDERING which should help
improve performance on some platforms.

Move sync_for_cpu call before we perform a prefetch to avoid
invalidating the first 128 bytes of the packet on architectures where
that call may invalidate the cache.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  3 ++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 57 ++-
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 581f44bbd7b3..b1da9f41c1dc 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -260,6 +260,9 @@ static inline void ixgbevf_write_tail(struct ixgbevf_ring 
*ring, u32 value)
 #define MIN_MSIX_Q_VECTORS 1
 #define MIN_MSIX_COUNT (MIN_MSIX_Q_VECTORS + NON_Q_VECTORS)
 
+#define IXGBEVF_RX_DMA_ATTR \
+   (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
 /* board specific private data structure */
 struct ixgbevf_adapter {
/* this field must be first, see ixgbevf_process_skb_fields */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 725fe2dca868..fbd493efd14e 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -595,8 +595,8 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring 
*rx_ring,
}
 
/* map page for use */
-   dma = dma_map_page(rx_ring->dev, page, 0,
-  PAGE_SIZE, DMA_FROM_DEVICE);
+   dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
+DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR);
 
/* if mapping failed free memory back to system since
 * there isn't much point in holding memory we can't use
@@ -639,6 +639,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring 
*rx_ring,
if (!ixgbevf_alloc_mapped_page(rx_ring, bi))
break;
 
+   /* sync the buffer for use by the device */
+   dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
+bi->page_offset,
+IXGBEVF_RX_BUFSZ,
+DMA_FROM_DEVICE);
+
/* Refresh the desc even if pkt_addr didn't change
 * because each write-back erases this info.
 */
@@ -741,12 +747,6 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring 
*rx_ring,
new_buff->page = old_buff->page;
new_buff->dma = old_buff->dma;
new_buff->page_offset = old_buff->page_offset;
-
-   /* sync the buffer for use by the device */
-   dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
-new_buff->page_offset,
-IXGBEVF_RX_BUFSZ,
-DMA_FROM_DEVICE);
 }
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
@@ -862,6 +862,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct 
ixgbevf_ring *rx_ring,
page = rx_buffer->page;
prefetchw(page);
 
+   /* we are reusing so sync this buffer for CPU use */
+   dma_sync_single_range_for_cpu(rx_ring->dev,
+ rx_buffer->dma,
+ rx_buffer->page_offset,
+ size,
+ DMA_FROM_DEVICE);
+
if (likely(!skb)) {
void *page_addr = page_address(page) +
  rx_buffer->page_offset;
@@ -887,21 +894,15 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct 
ixgbevf_ring *rx_ring,
prefetchw(skb->data);
}
 
-   /* we are reusing so sync this buffer for CPU use */
-   dma_sync_single_range_for_cpu(rx_ring->dev,
- rx_buffer->dma,
- rx_buffer->page_offset,
- size,
- DMA_FROM_DEVICE);
-
/* pull page into skb */
if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
/* hand second half of page back to the ring */
ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
} else {
   

[net-next 05/15] ixgbevf: update code to better handle incrementing page count

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on commit bd4171a5d4c2
("igb: update code to better handle incrementing page count")

Update the driver code so that we do bulk updates of the page reference
count instead of just incrementing it by one reference at a time.  The
advantage to doing this is that we cut down on atomic operations and
this in turn should give us a slight improvement in cycles per packet.
In addition if we eventually move this over to using build_skb the gains
will be more noticeable.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  7 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 30 +--
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index b1da9f41c1dc..c70a789035ae 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -62,7 +62,12 @@ struct ixgbevf_tx_buffer {
 struct ixgbevf_rx_buffer {
dma_addr_t dma;
struct page *page;
-   unsigned int page_offset;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+   __u32 page_offset;
+#else
+   __u16 page_offset;
+#endif
+   __u16 pagecnt_bias;
 };
 
 struct ixgbevf_stats {
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index fbd493efd14e..ae2402ddd9fb 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -611,6 +611,7 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring 
*rx_ring,
bi->dma = dma;
bi->page = page;
bi->page_offset = 0;
+   bi->pagecnt_bias = 1;
 
return true;
 }
@@ -747,6 +748,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring 
*rx_ring,
new_buff->page = old_buff->page;
new_buff->dma = old_buff->dma;
new_buff->page_offset = old_buff->page_offset;
+   new_buff->pagecnt_bias = old_buff->pagecnt_bias;
 }
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
@@ -758,13 +760,15 @@ static bool ixgbevf_can_reuse_rx_page(struct 
ixgbevf_rx_buffer *rx_buffer,
  struct page *page,
  const unsigned int truesize)
 {
+   unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+
/* avoid re-using remote pages */
if (unlikely(ixgbevf_page_is_reserved(page)))
return false;
 
 #if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
-   if (unlikely(page_count(page) != 1))
+   if (unlikely(page_ref_count(page) != pagecnt_bias))
return false;
 
/* flip page offset to other buffer */
@@ -778,10 +782,15 @@ static bool ixgbevf_can_reuse_rx_page(struct 
ixgbevf_rx_buffer *rx_buffer,
return false;
 
 #endif
-   /* Even if we own the page, we are not allowed to use atomic_set()
-* This would break get_page_unless_zero() users.
+
+   /* If we have drained the page fragment pool we need to update
+* the pagecnt_bias and page count so that we fully restock the
+* number of references the driver holds.
 */
-   page_ref_inc(page);
+   if (unlikely(pagecnt_bias == 1)) {
+   page_ref_add(page, USHRT_MAX);
+   rx_buffer->pagecnt_bias = USHRT_MAX;
+   }
 
return true;
 }
@@ -827,7 +836,6 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring 
*rx_ring,
return true;
 
/* this page cannot be reused so discard it */
-   put_page(page);
return false;
}
 
@@ -899,10 +907,13 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct 
ixgbevf_ring *rx_ring,
/* hand second half of page back to the ring */
ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
} else {
-   /* we are not reusing the buffer so unmap it */
+   /* We are not reusing the buffer so unmap it and free
+* any references we are holding to it
+*/
dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
 PAGE_SIZE, DMA_FROM_DEVICE,
 IXGBEVF_RX_DMA_ATTR);
+   __page_frag_cache_drain(page, rx_buffer->pagecnt_bias);
}
 
/* clear contents of buffer_info */
@@ -2135,6 +2146,8 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring 
*rx_ring)
struct ixgbevf_rx_buffer *rx_buffer;
 
		rx_buffer = &rx_ring->rx_buffer_info[i];
+   if (!rx_buffer->page)
+   continue;
 

[net-next 08/15] ixgbevf: improve performance and reduce size of ixgbevf_tx_map()

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on commit ec718254cbfe
("ixgbe: Improve performance and reduce size of ixgbe_tx_map")

This change is meant to both improve the performance and reduce the size of
ixgbevf_tx_map().

Expand the work done in the main loop by pushing first into tx_buffer.
This allows us to pull in the dma_mapping_error check, the tx_buffer value
assignment, and the initial DMA value assignment to the Tx descriptor.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 45 ++-
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index a793f9ea05e7..d3415ee38597 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3532,34 +3532,37 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
   struct ixgbevf_tx_buffer *first,
   const u8 hdr_len)
 {
-   dma_addr_t dma;
struct sk_buff *skb = first->skb;
struct ixgbevf_tx_buffer *tx_buffer;
union ixgbe_adv_tx_desc *tx_desc;
-   struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
-   unsigned int data_len = skb->data_len;
-   unsigned int size = skb_headlen(skb);
-   unsigned int paylen = skb->len - hdr_len;
+   struct skb_frag_struct *frag;
+   dma_addr_t dma;
+   unsigned int data_len, size;
u32 tx_flags = first->tx_flags;
-   __le32 cmd_type;
+   __le32 cmd_type = ixgbevf_tx_cmd_type(tx_flags);
u16 i = tx_ring->next_to_use;
 
tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 
-   ixgbevf_tx_olinfo_status(tx_desc, tx_flags, paylen);
-   cmd_type = ixgbevf_tx_cmd_type(tx_flags);
+   ixgbevf_tx_olinfo_status(tx_desc, tx_flags, skb->len - hdr_len);
+
+   size = skb_headlen(skb);
+   data_len = skb->data_len;
 
dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
-   if (dma_mapping_error(tx_ring->dev, dma))
-   goto dma_error;
 
-   /* record length, and DMA address */
-   dma_unmap_len_set(first, len, size);
-   dma_unmap_addr_set(first, dma, dma);
+   tx_buffer = first;
 
-   tx_desc->read.buffer_addr = cpu_to_le64(dma);
+   for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
+   if (dma_mapping_error(tx_ring->dev, dma))
+   goto dma_error;
+
+   /* record length, and DMA address */
+   dma_unmap_len_set(tx_buffer, len, size);
+   dma_unmap_addr_set(tx_buffer, dma, dma);
+
+   tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
-   for (;;) {
while (unlikely(size > IXGBE_MAX_DATA_PER_TXD)) {
tx_desc->read.cmd_type_len =
cmd_type | cpu_to_le32(IXGBE_MAX_DATA_PER_TXD);
@@ -3570,12 +3573,12 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
i = 0;
}
+   tx_desc->read.olinfo_status = 0;
 
dma += IXGBE_MAX_DATA_PER_TXD;
size -= IXGBE_MAX_DATA_PER_TXD;
 
tx_desc->read.buffer_addr = cpu_to_le64(dma);
-   tx_desc->read.olinfo_status = 0;
}
 
if (likely(!data_len))
@@ -3589,23 +3592,15 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
i = 0;
}
+   tx_desc->read.olinfo_status = 0;
 
size = skb_frag_size(frag);
data_len -= size;
 
dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
   DMA_TO_DEVICE);
-   if (dma_mapping_error(tx_ring->dev, dma))
-   goto dma_error;
 
		tx_buffer = &tx_ring->tx_buffer_info[i];
-   dma_unmap_len_set(tx_buffer, len, size);
-   dma_unmap_addr_set(tx_buffer, dma, dma);
-
-   tx_desc->read.buffer_addr = cpu_to_le64(dma);
-   tx_desc->read.olinfo_status = 0;
-
-   frag++;
}
 
/* write last descriptor with RS and EOP bits */
-- 
2.14.3



[net-next 13/15] ixgbe: add support for reporting 5G link speed

2018-01-26 Thread Jeff Kirsher
From: Paul Greenwalt 

Since 5G link speed is supported by some devices, add reporting of 5G link
speed.

Signed-off-by: Paul Greenwalt 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index bbb622f15a77..643c7288ea0f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7259,6 +7259,9 @@ static void ixgbe_watchdog_link_is_up(struct 
ixgbe_adapter *adapter)
case IXGBE_LINK_SPEED_10GB_FULL:
speed_str = "10 Gbps";
break;
+   case IXGBE_LINK_SPEED_5GB_FULL:
+   speed_str = "5 Gbps";
+   break;
case IXGBE_LINK_SPEED_2_5GB_FULL:
speed_str = "2.5 Gbps";
break;
-- 
2.14.3



[net-next 07/15] ixgbevf: clear rx_buffer_info in configure instead of clean

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

Based on commit d2bead576e67
("igb: Clear Rx buffer_info in configure instead of clean")

This change makes it so that instead of going through the entire ring on Rx
cleanup we only go through the region that was designated to be cleaned up
and stop when we reach the region where new allocations should start.

In addition we can avoid having to perform a memset on the Rx buffer_info
structures until we are about to start using the ring again.

Signed-off-by: Emil Tantilov 
Tested-by: Krishneil Singh 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 26 +++
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 350afec3dde8..a793f9ea05e7 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1773,6 +1773,10 @@ static void ixgbevf_configure_rx_ring(struct 
ixgbevf_adapter *adapter,
IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0);
ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx);
 
+   /* initialize rx_buffer_info */
+   memset(ring->rx_buffer_info, 0,
+  sizeof(struct ixgbevf_rx_buffer) * ring->count);
+
/* initialize Rx descriptor 0 */
rx_desc = IXGBEVF_RX_DESC(ring, 0);
rx_desc->wb.upper.length = 0;
@@ -2131,8 +2135,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter)
  **/
 static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring)
 {
-   unsigned long size;
-   unsigned int i;
+   u16 i = rx_ring->next_to_clean;
 
/* Free Rx ring sk_buff */
if (rx_ring->skb) {
@@ -2140,17 +2143,11 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring 
*rx_ring)
rx_ring->skb = NULL;
}
 
-   /* ring already cleared, nothing to do */
-   if (!rx_ring->rx_buffer_info)
-   return;
-
/* Free all the Rx ring pages */
-   for (i = 0; i < rx_ring->count; i++) {
+   while (i != rx_ring->next_to_alloc) {
struct ixgbevf_rx_buffer *rx_buffer;
 
		rx_buffer = &rx_ring->rx_buffer_info[i];
-   if (!rx_buffer->page)
-   continue;
 
/* Invalidate cache lines that may have been written to by
 * device so that we avoid corrupting memory.
@@ -2171,11 +2168,14 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring 
*rx_ring)
__page_frag_cache_drain(rx_buffer->page,
rx_buffer->pagecnt_bias);
 
-   rx_buffer->page = NULL;
+   i++;
+   if (i == rx_ring->count)
+   i = 0;
}
 
-   size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count;
-   memset(rx_ring->rx_buffer_info, 0, size);
+   rx_ring->next_to_alloc = 0;
+   rx_ring->next_to_clean = 0;
+   rx_ring->next_to_use = 0;
 }
 
 /**
@@ -3090,7 +3090,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring 
*rx_ring)
int size;
 
size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count;
-   rx_ring->rx_buffer_info = vzalloc(size);
+   rx_ring->rx_buffer_info = vmalloc(size);
if (!rx_ring->rx_buffer_info)
goto err;
 
-- 
2.14.3



[net-next 14/15] ixgbe: Fix && vs || typo

2018-01-26 Thread Jeff Kirsher
From: Dan Carpenter 

"offset" can't be both 0x0 and 0x so presumably || was intended
instead of &&.  That matches with how this check is done in other
functions.

Fixes: 73834aec7199 ("ixgbe: extend firmware version support")
Signed-off-by: Dan Carpenter 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 7ac7ef9b37ff..61188f343955 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -4087,7 +4087,7 @@ void ixgbe_get_oem_prod_version(struct ixgbe_hw *hw,
	hw->eeprom.ops.read(hw, NVM_OEM_PROD_VER_PTR, &offset);
 
/* Return is offset to OEM Product Version block is invalid */
-   if (offset == 0x0 && offset == NVM_INVALID_PTR)
+   if (offset == 0x0 || offset == NVM_INVALID_PTR)
return;
 
/* Read product version block */
-- 
2.14.3



[net-next 15/15] ixgbe: don't set RXDCTL.RLPML for 82599

2018-01-26 Thread Jeff Kirsher
From: Emil Tantilov 

commit 2de6aa3a666e ("ixgbe: Add support for padding packet")

Uses RXDCTL.RLPML to limit the maximum frame size on Rx when using
build_skb. Unfortunately that register does not work on 82599.

Added an explicit check to avoid setting this register on 82599 MAC.

Extended the comment related to the setting of RXDCTL.RLPML to better
explain its purpose.

Signed-off-by: Emil Tantilov 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 643c7288ea0f..0da5aa2c8aba 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4133,11 +4133,15 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter 
*adapter,
rxdctl &= ~0x3F;
rxdctl |=  0x080420;
 #if (PAGE_SIZE < 8192)
-   } else {
+   /* RXDCTL.RLPML does not work on 82599 */
+   } else if (hw->mac.type != ixgbe_mac_82599EB) {
rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
IXGBE_RXDCTL_RLPML_EN);
 
-   /* Limit the maximum frame size so we don't overrun the skb */
+   /* Limit the maximum frame size so we don't overrun the skb.
+* This can happen in SRIOV mode when the MTU of the VF is
+* higher than the MTU of the PF.
+*/
if (ring_uses_build_skb(ring) &&
		    !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
rxdctl |= IXGBE_MAX_2K_FRAME_BUILD_SKB |
-- 
2.14.3



  1   2   >