date:20160513

[PATCH -next v2 2/4] net: w5100: fix MAC filtering for W5500

2016-05-13 Thread Akinobu Mita

W5500 has different bit position for MAC filter in Socket n mode
register from W5100 and W5200.

Signed-off-by: Akinobu Mita 
Cc: Mike Sinkovsky 
Cc: David S. Miller 
---
* No changes from v1

 drivers/net/ethernet/wiznet/w5100.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/wiznet/w5100.c 
b/drivers/net/ethernet/wiznet/w5100.c
index 56ceed9..c80438c 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -63,8 +63,9 @@ MODULE_LICENSE("GPL");
 #define S0_REGS(priv)  ((priv)->s0_regs)
 
 #define W5100_S0_MR(priv)  (S0_REGS(priv) + W5100_Sn_MR)
-#define   S0_MR_MACRAW   0x04 /* MAC RAW mode (promiscuous) */
-#define   S0_MR_MACRAW_MF0x44 /* MAC RAW mode (filtered) */
+#define   S0_MR_MACRAW   0x04 /* MAC RAW mode */
+#define   S0_MR_MF   0x40 /* MAC Filter for W5100 and W5200 */
+#define   W5500_S0_MR_MF 0x80 /* MAC Filter for W5500 */
 #define W5100_S0_CR(priv)  (S0_REGS(priv) + W5100_Sn_CR)
 #define   S0_CR_OPEN 0x01 /* OPEN command */
 #define   S0_CR_CLOSE0x10 /* CLOSE command */
@@ -702,8 +703,16 @@ static int w5100_hw_reset(struct w5100_priv *priv)
 
 static void w5100_hw_start(struct w5100_priv *priv)
 {
-   w5100_write(priv, W5100_S0_MR(priv), priv->promisc ?
- S0_MR_MACRAW : S0_MR_MACRAW_MF);
+   u8 mode = S0_MR_MACRAW;
+
+   if (!priv->promisc) {
+   if (priv->ops->chip_id == W5500)
+   mode |= W5500_S0_MR_MF;
+   else
+   mode |= S0_MR_MF;
+   }
+
+   w5100_write(priv, W5100_S0_MR(priv), mode);
w5100_command(priv, S0_CR_OPEN);
w5100_enable_intr(priv);
 }
-- 
2.7.4

[PATCH -next v2 4/4] net: w5100-spi: add support to specify MAC address by device tree

2016-05-13 Thread Akinobu Mita

This adds support to specify the MAC address by 'mac-address' or
'local-mac-address' properties in the device tree.  These are common
properties for the Ethernet controller.

Signed-off-by: Akinobu Mita 
Cc: Mike Sinkovsky 
Cc: David S. Miller 
---
* No changes from v1

 drivers/net/ethernet/wiznet/w5100-spi.c | 4 +++-
 drivers/net/ethernet/wiznet/w5100.c | 5 +++--
 drivers/net/ethernet/wiznet/w5100.h | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/wiznet/w5100-spi.c 
b/drivers/net/ethernet/wiznet/w5100-spi.c
index b868e45..93a2d3c 100644
--- a/drivers/net/ethernet/wiznet/w5100-spi.c
+++ b/drivers/net/ethernet/wiznet/w5100-spi.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "w5100.h"
@@ -414,6 +415,7 @@ static int w5100_spi_probe(struct spi_device *spi)
const struct spi_device_id *id = spi_get_device_id(spi);
const struct w5100_ops *ops;
int priv_size;
+   const void *mac = of_get_mac_address(spi->dev.of_node);
 
switch (id->driver_data) {
case W5100:
@@ -432,7 +434,7 @@ static int w5100_spi_probe(struct spi_device *spi)
return -EINVAL;
}
 
-   return w5100_probe(>dev, ops, priv_size, NULL, spi->irq, -EINVAL);
+   return w5100_probe(>dev, ops, priv_size, mac, spi->irq, -EINVAL);
 }
 
 static int w5100_spi_remove(struct spi_device *spi)
diff --git a/drivers/net/ethernet/wiznet/w5100.c 
b/drivers/net/ethernet/wiznet/w5100.c
index 43fdf88..e9ae7e6 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -1052,7 +1052,7 @@ static const struct net_device_ops w5100_netdev_ops = {
 static int w5100_mmio_probe(struct platform_device *pdev)
 {
struct wiznet_platform_data *data = dev_get_platdata(>dev);
-   u8 *mac_addr = NULL;
+   const void *mac_addr = NULL;
struct resource *mem;
const struct w5100_ops *ops;
int irq;
@@ -1087,7 +1087,8 @@ void *w5100_ops_priv(const struct net_device *ndev)
 EXPORT_SYMBOL_GPL(w5100_ops_priv);
 
 int w5100_probe(struct device *dev, const struct w5100_ops *ops,
-   int sizeof_ops_priv, u8 *mac_addr, int irq, int link_gpio)
+   int sizeof_ops_priv, const void *mac_addr, int irq,
+   int link_gpio)
 {
struct w5100_priv *priv;
struct net_device *ndev;
diff --git a/drivers/net/ethernet/wiznet/w5100.h 
b/drivers/net/ethernet/wiznet/w5100.h
index f8a16fa..17983a3 100644
--- a/drivers/net/ethernet/wiznet/w5100.h
+++ b/drivers/net/ethernet/wiznet/w5100.h
@@ -30,7 +30,8 @@ struct w5100_ops {
 void *w5100_ops_priv(const struct net_device *ndev);
 
 int w5100_probe(struct device *dev, const struct w5100_ops *ops,
-   int sizeof_ops_priv, u8 *mac_addr, int irq, int link_gpio);
+   int sizeof_ops_priv, const void *mac_addr, int irq,
+   int link_gpio);
 int w5100_remove(struct device *dev);
 
 extern const struct dev_pm_ops w5100_pm_ops;
-- 
2.7.4

[PATCH -next v2 3/4] net: w5100: increase TX timeout period

2016-05-13 Thread Akinobu Mita

This increases TX timeout period from one second to 5 seconds which is
the default value if the driver doesn't explicitly set
net_device->watchdog_timeo.

The one second timeout is too short for W5100 with SPI interface mode
which doesn't support burst READ/WRITE processing in the SPI transfer.
If the packet is transmitted while RX packets are being received at a
very high rate, the TX transmittion work in the workqueue is delayed
and the watchdog timer is expired.

Signed-off-by: Akinobu Mita 
Cc: Mike Sinkovsky 
Cc: David S. Miller 
---
* v2
- Remove the watchdong_timeo assignment to set default tx timeout,
  suggested by David Miller.

 drivers/net/ethernet/wiznet/w5100.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/wiznet/w5100.c 
b/drivers/net/ethernet/wiznet/w5100.c
index c80438c..43fdf88 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -1142,7 +1142,6 @@ int w5100_probe(struct device *dev, const struct 
w5100_ops *ops,
 
ndev->netdev_ops = _netdev_ops;
ndev->ethtool_ops = _ethtool_ops;
-   ndev->watchdog_timeo = HZ;
netif_napi_add(ndev, >napi, w5100_napi_poll, 16);
 
/* This chip doesn't support VLAN packets with normal MTU,
-- 
2.7.4

[PATCH -next v2 0/4] net: w5100: collection of small changes

2016-05-13 Thread Akinobu Mita

This patch series is the collection of relatively small changes for
w5100 driver which includes a cleanup with no functional change,
two fixes, and adding a functionality.

* Changes from v1
- Remove the watchdong_timeo assignment to set default tx timeout,
  suggested by David Miller.

Akinobu Mita (4):
  net: w5100: remove unused is_w5200()
  net: w5100: fix MAC filtering for W5500
  net: w5100: increase TX timeout period
  net: w5100-spi: add support to specify MAC address by device tree

 drivers/net/ethernet/wiznet/w5100-spi.c |  4 +++-
 drivers/net/ethernet/wiznet/w5100.c | 28 
 drivers/net/ethernet/wiznet/w5100.h |  3 ++-
 3 files changed, 21 insertions(+), 14 deletions(-)

Cc: Mike Sinkovsky 
Cc: David S. Miller 
-- 
2.7.4

[PATCH -next v2 1/4] net: w5100: remove unused is_w5200()

2016-05-13 Thread Akinobu Mita

The is_w5200() function is not used anymore by the commit which adds
the W5500 support.

Signed-off-by: Akinobu Mita 
Cc: Mike Sinkovsky 
Cc: David S. Miller 
---
* No changes from v1

 drivers/net/ethernet/wiznet/w5100.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/ethernet/wiznet/w5100.c 
b/drivers/net/ethernet/wiznet/w5100.c
index 8ed0c77..56ceed9 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -173,11 +173,6 @@ struct w5100_priv {
struct work_struct restart_work;
 };
 
-static inline bool is_w5200(struct w5100_priv *priv)
-{
-   return priv->ops->chip_id == W5200;
-}
-
 /
  *
  *  Lowlevel I/O functions
-- 
2.7.4

Re: [RFC PATCH 2/2] Documentation: devictree: Add macb mdio bindings

2016-05-13 Thread Harini Katakam

HI Andrew,

On Fri, May 13, 2016 at 11:13 PM, Andrew Lunn  wrote:
> Hi Harini
>
> Is this backward compatible? Will devices using the old binding still
> work?

It isn't right now.
I will have to assign the bus read/write functions conditionally in order to
do that - I'll see if I can make it clean.

>
>   /* Disable RX and TX (XXX: Should we halt the transmission
>* more gracefully?)
> */
> -   macb_writel(bp, NCR, 0);
> +   ctrl = macb_readl(bp, NCR);
> +   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
> +   macb_writel(bp, NCR, ctrl);
>
> /* Clear the stats registers (XXX: Update stats first?) */
> -   macb_writel(bp, NCR, MACB_BIT(CLRSTAT));
> +   ctrl |= MACB_BIT(CLRSTAT);
> +   macb_writel(bp, NCR, ctrl);
>
> /* Clear all status flags */
> macb_writel(bp, TSR, -1);
>
> It is not clear to me what this part has to do with MDIO.
>

Sorry, I'll move this to a separate patch in my next version.
It is intended to write those registers without disturbing reserved bits.

Regards,
Harini

[PATCH] nf_queue: Make the queue_handler pernet

2016-05-13 Thread Eric W. Biederman


Florian Weber reported:
> Under full load (unshare() in loop -> OOM conditions) we can
> get kernel panic:
>
> BUG: unable to handle kernel NULL pointer dereference at 0008
> IP: [] nfqnl_nf_hook_drop+0x35/0x70
> [..]
> task: 88012dfa3840 ti: 88012dffc000 task.ti: 88012dffc000
> RIP: 0010:[]  [] 
> nfqnl_nf_hook_drop+0x35/0x70
> RSP: :88012dfffd80  EFLAGS: 00010206
> RAX: 0008 RBX: 81add0c0 RCX: 88013fd8
> [..]
> Call Trace:
>  [] nf_queue_nf_hook_drop+0x18/0x20
>  [] nf_unregister_net_hook+0xdb/0x150
>  [] netfilter_net_exit+0x2f/0x60
>  [] ops_exit_list.isra.4+0x38/0x60
>  [] setup_net+0xc2/0x120
>  [] copy_net_ns+0x79/0x120
>  [] create_new_namespaces+0x11b/0x1e0
>  [] unshare_nsproxy_namespaces+0x57/0xa0
>  [] SyS_unshare+0x1b2/0x340
>  [] entry_SYSCALL_64_fastpath+0x1e/0xa8
> Code: 65 00 48 89 e5 41 56 41 55 41 54 53 83 e8 01 48 8b 97 70 12 00 00 48 98 
> 49 89 f4 4c 8b 74 c2 18 4d 8d 6e 08 49 81 c6 88 00 00 00 <49> 8b 5d 00 48 85 
> db 74 1a 48 89 df 4c 89 e2 48 c7 c6 90 68 47
>

The simple fix for this requires a new pernet variable for struct
nf_queue that indicates when it is safe to use the dynamically
allocated nf_queue state.

As we need a variable anyway make nf_register_queue_handler and
nf_unregister_queue_handler pernet.  This allows the existing logic of
when it is safe to use the state from the nfnetlink_queue module to be
reused with no changes except for making it per net.

The syncrhonize_rcu from nf_unregister_queue_handler is moved to a new
function nfnl_queue_net_exit_batch so that the worst case of having a
syncrhonize_rcu in the pernet exit path is not experienced in batch
mode.

Reported-by: Florian Westphal 
Acked-by: Florian Westphal 
Signed-off-by: "Eric W. Biederman" 
---
 include/net/netfilter/nf_queue.h |  4 ++--
 include/net/netns/netfilter.h|  2 ++
 net/netfilter/nf_queue.c | 17 -
 net/netfilter/nfnetlink_queue.c  | 18 --
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 9c5638ad872e..0dbce55437f2 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -28,8 +28,8 @@ struct nf_queue_handler {
struct nf_hook_ops *ops);
 };
 
-void nf_register_queue_handler(const struct nf_queue_handler *qh);
-void nf_unregister_queue_handler(void);
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler 
*qh);
+void nf_unregister_queue_handler(struct net *net);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
 void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 38aa4983e2a9..36d723579af2 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -5,11 +5,13 @@
 
 struct proc_dir_entry;
 struct nf_logger;
+struct nf_queue_handler;
 
 struct netns_nf {
 #if defined CONFIG_PROC_FS
struct proc_dir_entry *proc_netfilter;
 #endif
+   const struct nf_queue_handler __rcu *queue_handler;
const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO];
 #ifdef CONFIG_SYSCTL
struct ctl_table_header *nf_log_dir_header;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e6ac..b19ad20a705c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
  * Once the queue is registered it must reinject all packets it
  * receives, no matter what.
  */
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
 
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler 
*qh)
 {
/* should never happen, we only have one queueing backend in kernel */
-   WARN_ON(rcu_access_pointer(queue_handler));
-   rcu_assign_pointer(queue_handler, qh);
+   WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+   rcu_assign_pointer(net->nf.queue_handler, qh);
 }
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
 {
-   RCU_INIT_POINTER(queue_handler, NULL);
-   synchronize_rcu();
+   RCU_INIT_POINTER(net->nf.queue_handler, NULL);
 }
 EXPORT_SYMBOL(nf_unregister_queue_handler);
 
@@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct 
nf_hook_ops *ops)
const struct nf_queue_handler *qh;
 
rcu_read_lock();
-   qh = rcu_dereference(queue_handler);
+   qh = rcu_dereference(net->nf.queue_handler);
if (qh)

Re: What ixgbe devices support HWTSTAMP_FILTER_ALL for hardware time stamping?

2016-05-13 Thread Guy Harris

On May 13, 2016, at 4:12 PM, Guy Harris  wrote:

>   drivers/net/ethernet/freescale/gianfar*.c
> 
> reports HWTSTAMP_FILTER_ALL even if the device doesn't have 
> FSL_GIANFAR_DEV_HAS_TIMER set so that it claims that devices without a timer 
> can do hardware timestamping,

Nope - I checked, and it doesn't claim to support any hardware timestamping at 
all if it doesn't have FSL_GIANFAR_DEV_HAS_TIMER set, so that *particular* 
driver is OK.

Re: [PATCH v2 4/5] drivers: net: xgene: fix statistics counters race condition

2016-05-13 Thread Eric Dumazet

On Fri, 2016-05-13 at 16:53 -0700, Iyappan Subramanian wrote:
> This patch fixes the race condition on updating the statistics
> counters by moving the counters to the ring structure.
> 

...

> @@ -1127,12 +1127,31 @@ static struct rtnl_link_stats64 
> *xgene_enet_get_stats64(
>  {
>   struct xgene_enet_pdata *pdata = netdev_priv(ndev);
>   struct rtnl_link_stats64 *stats = >stats;
> + struct xgene_enet_desc_ring *ring;
> + int i;
> +
> + memset(stats, 0, sizeof(struct rtnl_link_stats64));

The memset() should already be done in the caller.

> + for (i = 0; i < pdata->txq_cnt; i++) {
> + ring = pdata->tx_ring[i];
> + if (ring) {
> + stats->tx_packets += ring->tx_packets;
> + stats->tx_bytes += ring->tx_bytes;
> + }
> + }
>  
> - stats->rx_errors += stats->rx_length_errors +
> - stats->rx_crc_errors +
> - stats->rx_frame_errors +
> - stats->rx_fifo_errors;


Right, this piece was completely buggy.

Not a race, but rx_errors was growing every time stats were fetched on
this device.


stats->rx_errors =   sum(...)was probably the intent.



> - memcpy(storage, >stats, sizeof(struct rtnl_link_stats64));
> + for (i = 0; i < pdata->rxq_cnt; i++) {
> + ring = pdata->rx_ring[i];
> + if (ring) {
> + stats->rx_packets += ring->rx_packets;
> + stats->rx_bytes += ring->rx_bytes;
> + stats->rx_errors += ring->rx_length_errors +
> + ring->rx_crc_errors +
> + ring->rx_frame_errors +
> + ring->rx_fifo_errors;
> + stats->rx_dropped += ring->rx_dropped;
> + }
> + }
> + memcpy(storage, stats, sizeof(struct rtnl_link_stats64));
>  
>   return storage;
>  }
> diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h 
> b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
> index cc40c30..092fbec 100644
> --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
> +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
> @@ -121,6 +121,16 @@ struct xgene_enet_desc_ring {
>   struct xgene_enet_raw_desc16 *raw_desc16;
>   };
>   __le64 *exp_bufs;
> + u64 tx_packets;
> + u64 tx_bytes;
> + u64 rx_packets;
> + u64 rx_bytes;
> + u64 rx_dropped;
> + u64 rx_errors;
> + u64 rx_length_errors;
> + u64 rx_crc_errors;
> + u64 rx_frame_errors;
> + u64 rx_fifo_errors;

Note that there is still a race on 32bit arches.

It seems this driver should really use "unsigned long" counters and
provide standard ndo_get_stats()

Re: [PATCH net] net: dsa: mv88e6xxx: remove bridge work

2016-05-13 Thread Vivien Didelot

Hi David,

Vivien Didelot  writes:

> Now that the bridge code defers the switchdev port state setting, there
> is no need to defer the port STP state change within the mv88e6xxx code.
> Thus get rid of the driver's bridge work code.
>
> This also fixes a race condition where the DSA layer assumes that the
> bridge code already set the unbridged port's STP state to Disabled
> before restoring the Forwarding state.
>
> As a consequence, this also fixes the FDB flush for the unbridged port
> which now correctly occurs during the Forwarding to Disabled transition.
>
> Fixes: 0bc05d585d38 ("switchdev: allow caller to explicitly request attr_set 
> as deferred")
> Reported-by: Andrew Lunn 
> Signed-off-by: Vivien Didelot 

This patch doesn't apply to -net, only applies to net-next...

How should I handle that, do I resend a patch for net-next with the good
subject prefix, and a v2 for -net?

Sorry for the noise,

  Vivien

Re: [PATCH v2 0/5] drivers: net: xgene: Bug fixes

2016-05-13 Thread David Miller

From: Iyappan Subramanian 
Date: Fri, 13 May 2016 16:52:56 -0700

> This patch set addresses the following bug fixes that were found during 
> testing.
> 
>   1. IPv4 forward test crash
> - drivers: net: xgene: fix IPv4 forward crash
> 
>   2. Sharing of irqs
> - drivers: net: xgene: fix sharing of irqs
> 
>   3. Ununiform latency across queues
> - drivers: net: xgene: fix ununiform latency across queues
> 
>   4. Fix statistics counters race condition
> - drivers: net: xgene: fix statistics counters race condition
> 
>   5. Correcting register offset and field lengths
> - drivers: net: xgene: fix register offset
> 
> v2: Address review comments from v1
> - Defer TSO fix, and reposting all other patches from v1
> 
> v1:
> - Initial version
> 
> Signed-off-by: Iyappan Subramanian 

Series applied, thanks.

Re: [net-next 00/14][pull request] 1GbE Intel Wired LAN Driver Updates 2016-05-13

2016-05-13 Thread David Miller

From: Jeff Kirsher 
Date: Fri, 13 May 2016 16:43:41 -0700

> This series contains updates to e1000e, igb and igbvf.

Pulled, thanks Jeff.

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Eric W. Biederman

Florian Westphal  writes:

> Eric W. Biederman  wrote:
>> Florian could you test and verify this patch fixes your issues?
>
> Yes, this seems to work.
>
> Pablo, I'm fine with this patch going into -nf/stable but I do not think
> making the pointers per netns is a desireable option in the long term.
>
>> Unlike the other possibilities that have been discussed this also
>> addresses the nf_queue path as well as the nf_queue_hook_drop path.
>
> The nf_queue path should have been fine, no?
>
> Or putting it differently: can we start processing skbs before a netns
> is fully initialized?

The practical case that worries me is what happens when someone does
"rmmod nfnetlink_queue" while the system is running.  It appears to me
that today we could free the per netns data during the rcu grace period
and cause a similar issue in nfnl_queue_pernet.

That looks like it could affect both the nf_queue path and the
nf_queue_nf_hook_drop path.

Eric

No "unreachable" response for an outgoing TCP connection when using fwmark

2016-05-13 Thread Marcin Szewczyk

Hi,

I hope this is the right group even though it concerns mainly
development but I have not found a better one.

I would like to route locally-generated traffic:
- to a TCP port "A" of host "H" via the interface "A",
- to a TCP port "B" of host "H" via the interface "B".

Interface "B" is sometimes unavailable. Then I would like the kernel to
notify the user with "Network is unreachable" error if one tries to
connect to the TCP port "B".

I have tried the following settings (when the interface "B" is down):

# iptables -t mangle -nvL OUTPUT
Chain OUTPUT (policy ACCEPT 2 packets, 120 bytes)
 pkts bytes target prot opt in out source   destination 

2   120 MARK   all  --  *  *   0.0.0.0/0host-H-ip   
  MARK set 0x64
(for simplicity the rule does not include ports yet)

# ip rule
0:  from all lookup local 
32765:  from all fwmark 0x64 lookup 200 
32766:  from all lookup main 
32767:  from all lookup default

# ip route show table 200
prohibit default

There are no surprises when the interface "B" is available and the 200
table contains a regular routing rule.

But when the interface "B" is not available I get satisfactory results
only for ICMP and UDP:

$ ping host-H-ip
PING host-H-ip (host-H-ip) 56(84) bytes of data.
ping: sendmsg: Network is unreachable

$ nc -nuv host-H-ip 5000
(UNKNOWN) [host-H-ip] 5000 (?) open
foo
too many output retries : Network is unreachable

For TCP it just hangs:
$ nc -nv host-H-ip 5000

I have found some threads that seem to be connected with the topic:
http://marc.info/?l=linux-netdev=136633466813095=2
http://www.spinics.net/lists/netdev/msg281954.html
http://netfilter-devel.vger.kernel.narkive.com/8ggjpr4G/netfilter-core-mangle-table-rules-are-not-taken-into-account-in-preliminary-routing-decision

Unfortunately, I still do not understand why I do not get "Network is
unreachable" for TCP as well. Could anybody explain it to me?

Best regards,

-- 
Marcin Szewczyk
http://wodny.org

[PATCH net] net: dsa: mv88e6xxx: remove bridge work

2016-05-13 Thread Vivien Didelot

Now that the bridge code defers the switchdev port state setting, there
is no need to defer the port STP state change within the mv88e6xxx code.
Thus get rid of the driver's bridge work code.

This also fixes a race condition where the DSA layer assumes that the
bridge code already set the unbridged port's STP state to Disabled
before restoring the Forwarding state.

As a consequence, this also fixes the FDB flush for the unbridged port
which now correctly occurs during the Forwarding to Disabled transition.

Fixes: 0bc05d585d38 ("switchdev: allow caller to explicitly request attr_set as 
deferred")
Reported-by: Andrew Lunn 
Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx.c | 37 -
 drivers/net/dsa/mv88e6xxx.h |  5 -
 2 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index a3f0e7e..ba9dfc9 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1373,6 +1373,7 @@ static void mv88e6xxx_port_stp_state_set(struct 
dsa_switch *ds, int port,
 {
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
int stp_state;
+   int err;
 
if (!mv88e6xxx_has(ps, MV88E6XXX_FLAG_PORTSTATE))
return;
@@ -1394,12 +1395,13 @@ static void mv88e6xxx_port_stp_state_set(struct 
dsa_switch *ds, int port,
break;
}
 
-   /* mv88e6xxx_port_stp_state_set may be called with softirqs disabled,
-* so we can not update the port state directly but need to schedule it.
-*/
-   ps->ports[port].state = stp_state;
-   set_bit(port, ps->port_state_update_mask);
-   schedule_work(>bridge_work);
+   mutex_lock(>smi_mutex);
+   err = _mv88e6xxx_port_state(ps, port, stp_state);
+   mutex_unlock(>smi_mutex);
+
+   if (err)
+   netdev_err(ds->ports[port], "failed to update state to %s\n",
+  mv88e6xxx_port_state_names[stp_state]);
 }
 
 static int _mv88e6xxx_port_pvid(struct mv88e6xxx_priv_state *ps, int port,
@@ -2535,27 +2537,6 @@ static void mv88e6xxx_port_bridge_leave(struct 
dsa_switch *ds, int port)
mutex_unlock(>smi_mutex);
 }
 
-static void mv88e6xxx_bridge_work(struct work_struct *work)
-{
-   struct mv88e6xxx_priv_state *ps;
-   struct dsa_switch *ds;
-   int port;
-
-   ps = container_of(work, struct mv88e6xxx_priv_state, bridge_work);
-   ds = ps->ds;
-
-   mutex_lock(>smi_mutex);
-
-   for (port = 0; port < ps->info->num_ports; ++port)
-   if (test_and_clear_bit(port, ps->port_state_update_mask) &&
-   _mv88e6xxx_port_state(ps, port, ps->ports[port].state))
-   netdev_warn(ds->ports[port],
-   "failed to update state to %s\n",
-   
mv88e6xxx_port_state_names[ps->ports[port].state]);
-
-   mutex_unlock(>smi_mutex);
-}
-
 static int _mv88e6xxx_phy_page_write(struct mv88e6xxx_priv_state *ps,
 int port, int page, int reg, int val)
 {
@@ -3145,8 +3126,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 
ps->ds = ds;
 
-   INIT_WORK(>bridge_work, mv88e6xxx_bridge_work);
-
if (mv88e6xxx_has(ps, MV88E6XXX_FLAG_EEPROM))
mutex_init(>eeprom_mutex);
 
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 40e8721..36d0e15 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -543,7 +543,6 @@ struct mv88e6xxx_vtu_stu_entry {
 
 struct mv88e6xxx_priv_port {
struct net_device *bridge_dev;
-   u8 state;
 };
 
 struct mv88e6xxx_priv_state {
@@ -593,10 +592,6 @@ struct mv88e6xxx_priv_state {
 
struct mv88e6xxx_priv_port  ports[DSA_MAX_PORTS];
 
-   DECLARE_BITMAP(port_state_update_mask, DSA_MAX_PORTS);
-
-   struct work_struct bridge_work;
-
/* A switch may have a GPIO line tied to its reset pin. Parse
 * this from the device tree, and use it before performing
 * switch soft reset.
-- 
2.8.2

[PATCH v2 4/5] drivers: net: xgene: fix statistics counters race condition

2016-05-13 Thread Iyappan Subramanian

This patch fixes the race condition on updating the statistics
counters by moving the counters to the ring structure.

Signed-off-by: Iyappan Subramanian 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c   | 19 ++-
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h   |  2 ++
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 41 +---
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h | 10 ++
 4 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 457f745..2f5638f 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring 
*ring,
struct xgene_enet_pdata *pdata,
enum xgene_enet_err_code status)
 {
-   struct rtnl_link_stats64 *stats = >stats;
-
switch (status) {
case INGRESS_CRC:
-   stats->rx_crc_errors++;
+   ring->rx_crc_errors++;
+   ring->rx_dropped++;
break;
case INGRESS_CHECKSUM:
case INGRESS_CHECKSUM_COMPUTE:
-   stats->rx_errors++;
+   ring->rx_errors++;
+   ring->rx_dropped++;
break;
case INGRESS_TRUNC_FRAME:
-   stats->rx_frame_errors++;
+   ring->rx_frame_errors++;
+   ring->rx_dropped++;
break;
case INGRESS_PKT_LEN:
-   stats->rx_length_errors++;
+   ring->rx_length_errors++;
+   ring->rx_dropped++;
break;
case INGRESS_PKT_UNDER:
-   stats->rx_frame_errors++;
+   ring->rx_frame_errors++;
+   ring->rx_dropped++;
break;
case INGRESS_FIFO_OVERRUN:
-   stats->rx_fifo_errors++;
+   ring->rx_fifo_errors++;
break;
default:
break;
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h 
b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
index ba7da98..ecfeffe 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -201,6 +201,8 @@ enum xgene_enet_rm {
 #define USERINFO_LEN   32
 #define FPQNUM_POS 32
 #define FPQNUM_LEN 12
+#define ELERR_POS   46
+#define ELERR_LEN   2
 #define NV_POS 50
 #define NV_LEN 1
 #define LL_POS 51
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index e75d1a9..d208b17 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff 
*skb,
 
skb_tx_timestamp(skb);
 
-   pdata->stats.tx_packets++;
-   pdata->stats.tx_bytes += skb->len;
+   tx_ring->tx_packets++;
+   tx_ring->tx_bytes += skb->len;
 
pdata->ring_ops->wr_cmd(tx_ring, count);
return NETDEV_TX_OK;
@@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct 
xgene_enet_desc_ring *rx_ring,
skb = buf_pool->rx_skb[skb_index];
 
/* checking for error */
-   status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+   status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) ||
+ GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
if (unlikely(status > 2)) {
dev_kfree_skb_any(skb);
xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev),
   status);
-   pdata->stats.rx_dropped++;
ret = -EIO;
goto out;
}
@@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring 
*rx_ring,
xgene_enet_skip_csum(skb);
}
 
-   pdata->stats.rx_packets++;
-   pdata->stats.rx_bytes += datalen;
+   rx_ring->rx_packets++;
+   rx_ring->rx_bytes += datalen;
napi_gro_receive(_ring->napi, skb);
 out:
if (--rx_ring->nbufpool == 0) {
@@ -1127,12 +1127,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64(
 {
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct rtnl_link_stats64 *stats = >stats;
+   struct xgene_enet_desc_ring *ring;
+   int i;
+
+   memset(stats, 0, sizeof(struct rtnl_link_stats64));
+   for (i = 0; i < pdata->txq_cnt; i++) {
+   ring = pdata->tx_ring[i];
+   if (ring) {
+   stats->tx_packets += ring->tx_packets;
+   stats->tx_bytes += ring->tx_bytes;
+   }
+

[PATCH v2 0/5] drivers: net: xgene: Bug fixes

2016-05-13 Thread Iyappan Subramanian

This patch set addresses the following bug fixes that were found during testing.

  1. IPv4 forward test crash
- drivers: net: xgene: fix IPv4 forward crash

  2. Sharing of irqs
- drivers: net: xgene: fix sharing of irqs

  3. Ununiform latency across queues
- drivers: net: xgene: fix ununiform latency across queues

  4. Fix statistics counters race condition
- drivers: net: xgene: fix statistics counters race condition

  5. Correcting register offset and field lengths
- drivers: net: xgene: fix register offset

v2: Address review comments from v1
- Defer TSO fix, and reposting all other patches from v1

v1:
- Initial version

Signed-off-by: Iyappan Subramanian 
---

Iyappan Subramanian (5):
  drivers: net: xgene: fix IPv4 forward crash
  drivers: net: xgene: fix sharing of irqs
  drivers: net: xgene: fix ununiform latency across queues
  drivers: net: xgene: fix statistics counters race condition
  drivers: net: xgene: fix register offset

 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c   | 11 ++--
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h   |  2 +
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c| 19 +++---
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h|  8 ++-
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c  | 75 +--
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h  | 18 --
 drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h |  2 +-
 7 files changed, 94 insertions(+), 41 deletions(-)

-- 
1.9.1

[PATCH v2 3/5] drivers: net: xgene: fix ununiform latency across queues

2016-05-13 Thread Iyappan Subramanian

This patch addresses ununiform latency across queues by adding
more queues to match with, upto number of CPU cores.

Also, number of interrupts are increased and the channel numbers
are reordered.

Signed-off-by: Iyappan Subramanian 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 30 ++--
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h |  8 +++
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index ed90664..e75d1a9 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -1247,6 +1247,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata 
*pdata)
for (i = 0; i < max_irqs; i++) {
ret = platform_get_irq(pdev, i);
if (ret <= 0) {
+   if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
+   max_irqs = i;
+   pdata->rxq_cnt = max_irqs / 2;
+   pdata->txq_cnt = max_irqs / 2;
+   pdata->cq_cnt = max_irqs / 2;
+   break;
+   }
dev_err(dev, "Unable to get ENET IRQ\n");
ret = ret ? : -ENXIO;
return ret;
@@ -1450,19 +1457,28 @@ static void xgene_enet_setup_ops(struct 
xgene_enet_pdata *pdata)
pdata->port_ops = _xgport_ops;
pdata->cle_ops = _cle3in_ops;
pdata->rm = RM0;
-   pdata->rxq_cnt = XGENE_NUM_RX_RING;
-   pdata->txq_cnt = XGENE_NUM_TX_RING;
-   pdata->cq_cnt = XGENE_NUM_TXC_RING;
+   if (!pdata->rxq_cnt) {
+   pdata->rxq_cnt = XGENE_NUM_RX_RING;
+   pdata->txq_cnt = XGENE_NUM_TX_RING;
+   pdata->cq_cnt = XGENE_NUM_TXC_RING;
+   }
break;
}
 
if (pdata->enet_id == XGENE_ENET1) {
switch (pdata->port_id) {
case 0:
-   pdata->cpu_bufnum = START_CPU_BUFNUM_0;
-   pdata->eth_bufnum = START_ETH_BUFNUM_0;
-   pdata->bp_bufnum = START_BP_BUFNUM_0;
-   pdata->ring_num = START_RING_NUM_0;
+   if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
+   pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0;
+   pdata->eth_bufnum = X2_START_ETH_BUFNUM_0;
+   pdata->bp_bufnum = X2_START_BP_BUFNUM_0;
+   pdata->ring_num = START_RING_NUM_0;
+   } else {
+   pdata->cpu_bufnum = START_CPU_BUFNUM_0;
+   pdata->eth_bufnum = START_ETH_BUFNUM_0;
+   pdata->bp_bufnum = START_BP_BUFNUM_0;
+   pdata->ring_num = START_RING_NUM_0;
+   }
break;
case 1:
if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
index 0a2887b..cc40c30 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
@@ -49,10 +49,10 @@
 #define XGENE_ENET_MSS 1448
 #define XGENE_MIN_ENET_FRAME_SIZE  60
 
-#define XGENE_MAX_ENET_IRQ 8
-#define XGENE_NUM_RX_RING  4
-#define XGENE_NUM_TX_RING  4
-#define XGENE_NUM_TXC_RING 4
+#define XGENE_MAX_ENET_IRQ 16
+#define XGENE_NUM_RX_RING  8
+#define XGENE_NUM_TX_RING  8
+#define XGENE_NUM_TXC_RING 8
 
 #define START_CPU_BUFNUM_0 0
 #define START_ETH_BUFNUM_0 2
-- 
1.9.1

[PATCH v2 1/5] drivers: net: xgene: fix IPv4 forward crash

2016-05-13 Thread Iyappan Subramanian

This patch fixes the crash observed during IPv4 forward test by
setting the drop field in the dbptr.

Signed-off-by: Iyappan Subramanian 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c | 11 ++-
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
index 6479288..472c0fb 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel,
 static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata,
  struct xgene_cle_dbptr *dbptr, u32 *buf)
 {
+   buf[0] = SET_VAL(CLE_DROP, dbptr->drop);
buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) |
 SET_VAL(CLE_DSTQIDL, dbptr->dstqid);
 
@@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata 
*pdata)
.branch = {
{
/* IPV4 */
-   .valid = 0,
+   .valid = 1,
.next_packet_pointer = 22,
.jump_bw = JMP_FW,
.jump_rel = JMP_ABS,
@@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata 
*pdata)
.next_node = PKT_PROT_NODE,
.next_branch = 0,
.data = 0x8,
-   .mask = 0x
+   .mask = 0x0
},
{
.valid = 0,
@@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata 
*pdata)
.next_node = RSS_IPV4_TCP_NODE,
.next_branch = 0,
.data = 0x0600,
-   .mask = 0x
+   .mask = 0x00ff
},
{
/* UDP */
@@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata 
*pdata)
.next_node = RSS_IPV4_UDP_NODE,
.next_branch = 0,
.data = 0x1100,
-   .mask = 0x
+   .mask = 0x00ff
},
{
.valid = 0,
@@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata 
*pdata)
{
/* TCP DST Port */
.valid = 0,
-   .next_packet_pointer = 256,
+   .next_packet_pointer = 258,
.jump_bw = JMP_FW,
.jump_rel = JMP_ABS,
.operation = EQT,
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h 
b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
index 13e829a..33c5f6b 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
@@ -83,6 +83,8 @@
 #define CLE_TYPE_POS   0
 #define CLE_TYPE_LEN   2
 
+#define CLE_DROP_POS   28
+#define CLE_DROP_LEN   1
 #define CLE_DSTQIDL_POS25
 #define CLE_DSTQIDL_LEN7
 #define CLE_DSTQIDH_POS0
-- 
1.9.1

[PATCH v2 5/5] drivers: net: xgene: fix register offset

2016-05-13 Thread Iyappan Subramanian

This patch fixes SG_RX_DV_GATE_REG_0_ADDR register offset
and ring state field lengths.

Signed-off-by: Iyappan Subramanian 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h| 6 +++---
 drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h 
b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
index ecfeffe..45220be 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -86,7 +86,7 @@ enum xgene_enet_rm {
 #define RINGADDRL_POS  5
 #define RINGADDRL_LEN  27
 #define RINGADDRH_POS  0
-#define RINGADDRH_LEN  6
+#define RINGADDRH_LEN  7
 #define RINGSIZE_POS   23
 #define RINGSIZE_LEN   3
 #define RINGTYPE_POS   19
@@ -94,9 +94,9 @@ enum xgene_enet_rm {
 #define RINGMODE_POS   20
 #define RINGMODE_LEN   3
 #define RECOMTIMEOUTL_POS  28
-#define RECOMTIMEOUTL_LEN  3
+#define RECOMTIMEOUTL_LEN  4
 #define RECOMTIMEOUTH_POS  0
-#define RECOMTIMEOUTH_LEN  2
+#define RECOMTIMEOUTH_LEN  3
 #define NUMMSGSINQ_POS 1
 #define NUMMSGSINQ_LEN 16
 #define ACCEPTLERR BIT(19)
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h 
b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
index 29a71b4..002df5a 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
@@ -33,7 +33,7 @@
 #define LINK_STATUSBIT(2)
 #define LINK_UPBIT(15)
 #define MPA_IDLE_WITH_QMI_EMPTYBIT(12)
-#define SG_RX_DV_GATE_REG_0_ADDR   0x0dfc
+#define SG_RX_DV_GATE_REG_0_ADDR   0x05fc
 
 extern const struct xgene_mac_ops xgene_sgmac_ops;
 extern const struct xgene_port_ops xgene_sgport_ops;
-- 
1.9.1

[PATCH v2 2/5] drivers: net: xgene: fix sharing of irqs

2016-05-13 Thread Iyappan Subramanian

Since hardware doesn't allow sharing of interrupts,
this patch fixes the same by removing IRQF_SHARED flag.

Signed-off-by: Iyappan Subramanian 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index aa87049..ed90664 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
ring = pdata->rx_ring[i];
irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
-  IRQF_SHARED, ring->irq_name, ring);
+  0, ring->irq_name, ring);
if (ret) {
netdev_err(ndev, "Failed to request irq %s\n",
   ring->irq_name);
@@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
ring = pdata->tx_ring[i]->cp_ring;
irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
-  IRQF_SHARED, ring->irq_name, ring);
+  0, ring->irq_name, ring);
if (ret) {
netdev_err(ndev, "Failed to request irq %s\n",
   ring->irq_name);
-- 
1.9.1

[net-next 04/14] igb: make igb_update_pf_vlvf static

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 0191c5f..cab3069 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6027,7 +6027,7 @@ static int igb_find_vlvf_entry(struct e1000_hw *hw, u32 
vlan)
return idx;
 }
 
-void igb_update_pf_vlvf(struct igb_adapter *adapter, u32 vid)
+static void igb_update_pf_vlvf(struct igb_adapter *adapter, u32 vid)
 {
struct e1000_hw *hw = >hw;
u32 bits, pf_id;
-- 
2.5.5

[net-next 03/14] igb: use BIT() macro or unsigned prefix

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

For bitshifts, we should make use of the BIT macro when possible, and
ensure that other bitshifts are marked as unsigned. This helps prevent
signed bitshift errors, and ensures similar style.

Make use of GENMASK and the unsigned postfix where BIT() isn't
appropriate.

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igb/e1000_82575.c   |   8 +-
 drivers/net/ethernet/intel/igb/e1000_82575.h   |  30 +++
 drivers/net/ethernet/intel/igb/e1000_defines.h | 108 -
 drivers/net/ethernet/intel/igb/e1000_mac.c |  10 +--
 drivers/net/ethernet/intel/igb/e1000_mbx.c |   4 +-
 drivers/net/ethernet/intel/igb/e1000_nvm.c |   2 +-
 drivers/net/ethernet/intel/igb/e1000_phy.h |   6 +-
 drivers/net/ethernet/intel/igb/igb.h   |  32 
 drivers/net/ethernet/intel/igb/igb_ethtool.c   |  18 ++---
 drivers/net/ethernet/intel/igb/igb_main.c  |  48 +--
 drivers/net/ethernet/intel/igb/igb_ptp.c   |   6 +-
 11 files changed, 136 insertions(+), 136 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c 
b/drivers/net/ethernet/intel/igb/e1000_82575.c
index a23aa67..a61447f 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.c
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.c
@@ -361,7 +361,7 @@ static s32 igb_init_nvm_params_82575(struct e1000_hw *hw)
if (size > 15)
size = 15;
 
-   nvm->word_size = 1 << size;
+   nvm->word_size = BIT(size);
nvm->opcode_bits = 8;
nvm->delay_usec = 1;
 
@@ -380,7 +380,7 @@ static s32 igb_init_nvm_params_82575(struct e1000_hw *hw)
16 : 8;
break;
}
-   if (nvm->word_size == (1 << 15))
+   if (nvm->word_size == BIT(15))
nvm->page_size = 128;
 
nvm->type = e1000_nvm_eeprom_spi;
@@ -391,7 +391,7 @@ static s32 igb_init_nvm_params_82575(struct e1000_hw *hw)
nvm->ops.write = igb_write_nvm_spi;
nvm->ops.validate = igb_validate_nvm_checksum;
nvm->ops.update = igb_update_nvm_checksum;
-   if (nvm->word_size < (1 << 15))
+   if (nvm->word_size < BIT(15))
nvm->ops.read = igb_read_nvm_eerd;
else
nvm->ops.read = igb_read_nvm_spi;
@@ -2107,7 +2107,7 @@ void igb_vmdq_set_anti_spoofing_pf(struct e1000_hw *hw, 
bool enable, int pf)
/* The PF can spoof - it has to in order to
 * support emulation mode NICs
 */
-   reg_val ^= (1 << pf | 1 << (pf + MAX_NUM_VFS));
+   reg_val ^= (BIT(pf) | BIT(pf + MAX_NUM_VFS));
} else {
reg_val &= ~(E1000_DTXSWC_MAC_SPOOF_MASK |
 E1000_DTXSWC_VLAN_SPOOF_MASK);
diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.h 
b/drivers/net/ethernet/intel/igb/e1000_82575.h
index de8805a..199ff98 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.h
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.h
@@ -168,16 +168,16 @@ struct e1000_adv_tx_context_desc {
 #define E1000_DCA_CTRL_DCA_MODE_CB2 0x02 /* DCA Mode CB2 */
 
 #define E1000_DCA_RXCTRL_CPUID_MASK 0x001F /* Rx CPUID Mask */
-#define E1000_DCA_RXCTRL_DESC_DCA_EN (1 << 5) /* DCA Rx Desc enable */
-#define E1000_DCA_RXCTRL_HEAD_DCA_EN (1 << 6) /* DCA Rx Desc header enable */
-#define E1000_DCA_RXCTRL_DATA_DCA_EN (1 << 7) /* DCA Rx Desc payload enable */
-#define E1000_DCA_RXCTRL_DESC_RRO_EN (1 << 9) /* DCA Rx rd Desc Relax Order */
+#define E1000_DCA_RXCTRL_DESC_DCA_EN BIT(5) /* DCA Rx Desc enable */
+#define E1000_DCA_RXCTRL_HEAD_DCA_EN BIT(6) /* DCA Rx Desc header enable */
+#define E1000_DCA_RXCTRL_DATA_DCA_EN BIT(7) /* DCA Rx Desc payload enable */
+#define E1000_DCA_RXCTRL_DESC_RRO_EN BIT(9) /* DCA Rx rd Desc Relax Order */
 
 #define E1000_DCA_TXCTRL_CPUID_MASK 0x001F /* Tx CPUID Mask */
-#define E1000_DCA_TXCTRL_DESC_DCA_EN (1 << 5) /* DCA Tx Desc enable */
-#define E1000_DCA_TXCTRL_DESC_RRO_EN (1 << 9) /* Tx rd Desc Relax Order */
-#define E1000_DCA_TXCTRL_TX_WB_RO_EN (1 << 11) /* Tx Desc writeback RO bit */
-#define E1000_DCA_TXCTRL_DATA_RRO_EN (1 << 13) /* Tx rd data Relax Order */
+#define E1000_DCA_TXCTRL_DESC_DCA_EN BIT(5) /* DCA Tx Desc enable */
+#define E1000_DCA_TXCTRL_DESC_RRO_EN BIT(9) /* Tx rd Desc Relax Order */
+#define E1000_DCA_TXCTRL_TX_WB_RO_EN BIT(11) /* Tx Desc writeback RO bit */
+#define E1000_DCA_TXCTRL_DATA_RRO_EN BIT(13) /* Tx rd data Relax Order */
 
 /* Additional DCA related definitions, note change in position of CPUID */
 #define E1000_DCA_TXCTRL_CPUID_MASK_82576 0xFF00 /* Tx CPUID Mask */
@@ -186,8 +186,8 @@ struct e1000_adv_tx_context_desc {
 #define E1000_DCA_RXCTRL_CPUID_SHIFT 24 /* Rx CPUID now in the last byte */
 
 /* ETQF register bit definitions */
-#define E1000_ETQF_FILTER_ENABLE   (1 << 26)
-#define E1000_ETQF_1588

[net-next 06/14] e1000e: e1000e_cyclecounter_read(): fix er32(SYSTIML) overflow check

2016-05-13 Thread Jeff Kirsher

From: Denys Vlasenko 

If two consecutive reads of the counter are the same, it is also
not an overflow.  "systimel_1 < systimel_2" should be
"systimel_1 <= systimel_2".

Before the patch, we could perform an *erroneous* correction:

Let's say that systimel_1 == systimel_2 == 0x.
"systimel_1 < systimel_2" is false, we think it's an overflow,
we read "systimeh = er32(SYSTIMH)" which meanwhile had incremented,
and use "(systimeh << 32) + systimel_2" value which is 2^32 too large.

Signed-off-by: Denys Vlasenko 
CC: intel-wired-...@lists.osuosl.org
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 4969f64..02c64bc 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4287,7 +4287,7 @@ static cycle_t e1000e_cyclecounter_read(const struct 
cyclecounter *cc)
systimeh = er32(SYSTIMH);
systimel_2 = er32(SYSTIML);
/* Check for overflow. If there was no overflow, use the values */
-   if (systimel_1 < systimel_2) {
+   if (systimel_1 <= systimel_2) {
systim = (cycle_t)systimel_1;
systim |= (cycle_t)systimeh << 32;
} else {
-- 
2.5.5

[net-next 01/14] e1000e: fix ethtool autoneg off for non-copper

2016-05-13 Thread Jeff Kirsher

From: Steve Shih 

This patch fixes the issues for disabling auto-negotiation and forcing
speed and duplex settings for the non-copper media.

For non-copper media, e1000_get_settings should return ETH_TP_MDI_INVALID for
eth_tp_mdix_ctrl instead of ETH_TP_MDI_AUTO so subsequent e1000_set_settings
call would not fail with -EOPNOTSUPP.

e1000_set_spd_dplx should not automatically turn autoneg back on for forced
1000 Mbps full duplex settings for non-copper media.

Cc: xe-ker...@external.cisco.com
Cc: Daniel Walker 
Signed-off-by: Steve Shih 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/ethtool.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c 
b/drivers/net/ethernet/intel/e1000e/ethtool.c
index 1e3973a..83a815b 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -201,6 +201,9 @@ static int e1000_get_settings(struct net_device *netdev,
else
ecmd->eth_tp_mdix_ctrl = hw->phy.mdix;
 
+   if (hw->phy.media_type != e1000_media_type_copper)
+   ecmd->eth_tp_mdix_ctrl = ETH_TP_MDI_INVALID;
+
return 0;
 }
 
@@ -236,8 +239,13 @@ static int e1000_set_spd_dplx(struct e1000_adapter 
*adapter, u32 spd, u8 dplx)
mac->forced_speed_duplex = ADVERTISE_100_FULL;
break;
case SPEED_1000 + DUPLEX_FULL:
-   mac->autoneg = 1;
-   adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
+   if (adapter->hw.phy.media_type == e1000_media_type_copper) {
+   mac->autoneg = 1;
+   adapter->hw.phy.autoneg_advertised =
+   ADVERTISE_1000_FULL;
+   } else {
+   mac->forced_speed_duplex = ADVERTISE_1000_FULL;
+   }
break;
case SPEED_1000 + DUPLEX_HALF:  /* not supported */
default:
-- 
2.5.5

[net-next 10/14] igbvf: use BIT() macro instead of shifts

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

To prevent signed bitshift issues, and improve code readability, use the
BIT() macro. Also make use of GENMASK or the unsigned postfix where this
is more appropriate than BIT()

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igbvf/defines.h |  2 +-
 drivers/net/ethernet/intel/igbvf/ethtool.c |  3 ++-
 drivers/net/ethernet/intel/igbvf/igbvf.h   |  4 ++--
 drivers/net/ethernet/intel/igbvf/netdev.c  | 10 +-
 drivers/net/ethernet/intel/igbvf/vf.c  |  2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/igbvf/defines.h 
b/drivers/net/ethernet/intel/igbvf/defines.h
index ae3f283..ee1ef08 100644
--- a/drivers/net/ethernet/intel/igbvf/defines.h
+++ b/drivers/net/ethernet/intel/igbvf/defines.h
@@ -113,7 +113,7 @@
 #define E1000_RXDCTL_QUEUE_ENABLE  0x0200 /* Enable specific Rx Que */
 
 /* Direct Cache Access (DCA) definitions */
-#define E1000_DCA_TXCTRL_TX_WB_RO_EN   (1 << 11) /* Tx Desc writeback RO bit */
+#define E1000_DCA_TXCTRL_TX_WB_RO_EN   BIT(11) /* Tx Desc writeback RO bit */
 
 #define E1000_VF_INIT_TIMEOUT  200 /* Number of retries to clear RSTI */
 
diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c 
b/drivers/net/ethernet/intel/igbvf/ethtool.c
index b74ce53..8dea1b1 100644
--- a/drivers/net/ethernet/intel/igbvf/ethtool.c
+++ b/drivers/net/ethernet/intel/igbvf/ethtool.c
@@ -154,7 +154,8 @@ static void igbvf_get_regs(struct net_device *netdev,
 
memset(p, 0, IGBVF_REGS_LEN * sizeof(u32));
 
-   regs->version = (1 << 24) | (adapter->pdev->revision << 16) |
+   regs->version = (1u << 24) |
+   (adapter->pdev->revision << 16) |
adapter->pdev->device;
 
regs_buff[0] = er32(CTRL);
diff --git a/drivers/net/ethernet/intel/igbvf/igbvf.h 
b/drivers/net/ethernet/intel/igbvf/igbvf.h
index f166baa..6f4290d 100644
--- a/drivers/net/ethernet/intel/igbvf/igbvf.h
+++ b/drivers/net/ethernet/intel/igbvf/igbvf.h
@@ -287,8 +287,8 @@ struct igbvf_info {
 };
 
 /* hardware capability, feature, and workaround flags */
-#define IGBVF_FLAG_RX_CSUM_DISABLED(1 << 0)
-#define IGBVF_FLAG_RX_LB_VLAN_BSWAP(1 << 1)
+#define IGBVF_FLAG_RX_CSUM_DISABLEDBIT(0)
+#define IGBVF_FLAG_RX_LB_VLAN_BSWAPBIT(1)
 #define IGBVF_RX_DESC_ADV(R, i) \
(&R).desc))[i].rx_desc))
 #define IGBVF_TX_DESC_ADV(R, i) \
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c 
b/drivers/net/ethernet/intel/igbvf/netdev.c
index 78af4c7..57894a8 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -964,7 +964,7 @@ static void igbvf_assign_vector(struct igbvf_adapter 
*adapter, int rx_queue,
ivar = ivar & 0xFF00;
ivar |= msix_vector | E1000_IVAR_VALID;
}
-   adapter->rx_ring[rx_queue].eims_value = 1 << msix_vector;
+   adapter->rx_ring[rx_queue].eims_value = BIT(msix_vector);
array_ew32(IVAR0, index, ivar);
}
if (tx_queue > IGBVF_NO_QUEUE) {
@@ -979,7 +979,7 @@ static void igbvf_assign_vector(struct igbvf_adapter 
*adapter, int rx_queue,
ivar = ivar & 0x00FF;
ivar |= (msix_vector | E1000_IVAR_VALID) << 8;
}
-   adapter->tx_ring[tx_queue].eims_value = 1 << msix_vector;
+   adapter->tx_ring[tx_queue].eims_value = BIT(msix_vector);
array_ew32(IVAR0, index, ivar);
}
 }
@@ -1014,8 +1014,8 @@ static void igbvf_configure_msix(struct igbvf_adapter 
*adapter)
 
ew32(IVAR_MISC, tmp);
 
-   adapter->eims_enable_mask = (1 << (vector)) - 1;
-   adapter->eims_other = 1 << (vector - 1);
+   adapter->eims_enable_mask = GENMASK(vector - 1, 0);
+   adapter->eims_other = BIT(vector - 1);
e1e_flush();
 }
 
@@ -2089,7 +2089,7 @@ static int igbvf_maybe_stop_tx(struct net_device *netdev, 
int size)
 }
 
 #define IGBVF_MAX_TXD_PWR  16
-#define IGBVF_MAX_DATA_PER_TXD (1 << IGBVF_MAX_TXD_PWR)
+#define IGBVF_MAX_DATA_PER_TXD (1u << IGBVF_MAX_TXD_PWR)
 
 static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter,
   struct igbvf_ring *tx_ring,
diff --git a/drivers/net/ethernet/intel/igbvf/vf.c 
b/drivers/net/ethernet/intel/igbvf/vf.c
index a13baa9..335ba66 100644
--- a/drivers/net/ethernet/intel/igbvf/vf.c
+++ b/drivers/net/ethernet/intel/igbvf/vf.c
@@ -266,7 +266,7 @@ static s32 e1000_set_vfta_vf(struct e1000_hw *hw, u16 vid, 
bool set)
msgbuf[1] = vid;
/* Setting the 8 bit field MSG INFO to true indicates "add" */
if (set)
-   msgbuf[0] |= 1 << E1000_VT_MSGINFO_SHIFT;
+   msgbuf[0] |= BIT(E1000_VT_MSGINFO_SHIFT);
 
mbx->ops.write_posted(hw, msgbuf, 2);
 
-- 
2.5.5

[net-next 05/14] e1000e: e1000e_cyclecounter_read(): incvalue is 32 bits, not 64

2016-05-13 Thread Jeff Kirsher

From: Denys Vlasenko 

"incvalue" variable holds a result of "er32(TIMINCA) &
E1000_TIMINCA_INCVALUE_MASK" and used in "do_div(temp, incvalue)"
as a divisor.

Thus, "u64 incvalue" declaration is probably a mistake.
Even though it seems to be a harmless one, let's fix it.

Signed-off-by: Denys Vlasenko 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 671256d..4969f64 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4300,7 +4300,8 @@ static cycle_t e1000e_cyclecounter_read(const struct 
cyclecounter *cc)
}
 
if ((hw->mac.type == e1000_82574) || (hw->mac.type == e1000_82583)) {
-   u64 incvalue, time_delta, rem, temp;
+   u64 time_delta, rem, temp;
+   u32 incvalue;
int i;
 
/* errata for 82574/82583 possible bad bits read from SYSTIMH/L
-- 
2.5.5

[net-next 09/14] igbvf: remove unused variable and dead code

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

The variable rdlen is set but never used, and thus setting it is dead
code. Remove it.

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igbvf/netdev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c 
b/drivers/net/ethernet/intel/igbvf/netdev.c
index c124422..78af4c7 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -1367,7 +1367,7 @@ static void igbvf_configure_rx(struct igbvf_adapter 
*adapter)
struct e1000_hw *hw = >hw;
struct igbvf_ring *rx_ring = adapter->rx_ring;
u64 rdba;
-   u32 rdlen, rxdctl;
+   u32 rxdctl;
 
/* disable receives */
rxdctl = er32(RXDCTL(0));
@@ -1375,8 +1375,6 @@ static void igbvf_configure_rx(struct igbvf_adapter 
*adapter)
e1e_flush();
msleep(10);
 
-   rdlen = rx_ring->count * sizeof(union e1000_adv_rx_desc);
-
/* Setup the HW Rx Head and Tail Descriptor Pointers and
 * the Base and Length of the Rx Descriptor Ring
 */
-- 
2.5.5

[net-next 07/14] e1000e: e1000e_cyclecounter_read(): do overflow check only if needed

2016-05-13 Thread Jeff Kirsher

From: Denys Vlasenko 

SYSTIMH:SYSTIML registers are incremented by 24-bit value TIMINCA[23..0]

er32(SYSTIML) are probably moderately expensive (they are pci bus reads).
Can we avoid one of them? Yes, we can.

If the SYSTIML value we see is smaller than 0xff00, the overflow
into SYSTIMH would require at least two increments.

We do two reads, er32(SYSTIML) and er32(SYSTIMH), in this order.

Even if one increment happens between them, the overflow into SYSTIMH
is impossible, and we can avoid doing another er32(SYSTIML) read
and overflow check.

Signed-off-by: Denys Vlasenko 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 02c64bc..0d3c00d 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4275,7 +4275,7 @@ static cycle_t e1000e_cyclecounter_read(const struct 
cyclecounter *cc)
struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter,
 cc);
struct e1000_hw *hw = >hw;
-   u32 systimel_1, systimel_2, systimeh;
+   u32 systimel, systimeh;
cycle_t systim, systim_next;
/* SYSTIMH latching upon SYSTIML read does not work well.
 * This means that if SYSTIML overflows after we read it but before
@@ -4283,21 +4283,21 @@ static cycle_t e1000e_cyclecounter_read(const struct 
cyclecounter *cc)
 * will experience a huge non linear increment in the systime value
 * to fix that we test for overflow and if true, we re-read systime.
 */
-   systimel_1 = er32(SYSTIML);
+   systimel = er32(SYSTIML);
systimeh = er32(SYSTIMH);
-   systimel_2 = er32(SYSTIML);
-   /* Check for overflow. If there was no overflow, use the values */
-   if (systimel_1 <= systimel_2) {
-   systim = (cycle_t)systimel_1;
-   systim |= (cycle_t)systimeh << 32;
-   } else {
-   /* There was an overflow, read again SYSTIMH, and use
-* systimel_2
-*/
-   systimeh = er32(SYSTIMH);
-   systim = (cycle_t)systimel_2;
-   systim |= (cycle_t)systimeh << 32;
+   /* Is systimel is so large that overflow is possible? */
+   if (systimel >= (u32)0x - E1000_TIMINCA_INCVALUE_MASK) {
+   u32 systimel_2 = er32(SYSTIML);
+   if (systimel > systimel_2) {
+   /* There was an overflow, read again SYSTIMH, and use
+* systimel_2
+*/
+   systimeh = er32(SYSTIMH);
+   systimel = systimel_2;
+   }
}
+   systim = (cycle_t)systimel;
+   systim |= (cycle_t)systimeh << 32;
 
if ((hw->mac.type == e1000_82574) || (hw->mac.type == e1000_82583)) {
u64 time_delta, rem, temp;
-- 
2.5.5

[net-next 08/14] igb: adjust PTP timestamps for Tx/Rx latency

2016-05-13 Thread Jeff Kirsher

From: Nathan Sullivan 

Table 7-62 on page 338 of the i210 datasheet lists TX and RX latencies
for the various speeds the chip supports.  To give better PTP timestamp
accuracy, adjust the timestamps by the amounts Intel gives based on
current link speed.

Signed-off-by: Nathan Sullivan 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igb/igb.h |  8 +++
 drivers/net/ethernet/intel/igb/igb_ptp.c | 36 
 2 files changed, 44 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h
index 7a7bc31..b9609af 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -91,6 +91,14 @@ struct igb_adapter;
 #define NVM_COMB_VER_OFF   0x0083
 #define NVM_COMB_VER_PTR   0x003d
 
+/* Transmit and receive latency (for PTP timestamps) */
+#define IGB_I210_TX_LATENCY_10 9542
+#define IGB_I210_TX_LATENCY_1001024
+#define IGB_I210_TX_LATENCY_1000   178
+#define IGB_I210_RX_LATENCY_10 20662
+#define IGB_I210_RX_LATENCY_1002213
+#define IGB_I210_RX_LATENCY_1000   448
+
 struct vf_data_storage {
unsigned char vf_mac_addresses[ETH_ALEN];
u16 vf_mc_hashes[IGB_MAX_VF_MC_ENTRIES];
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c 
b/drivers/net/ethernet/intel/igb/igb_ptp.c
index fdb6dfd..f097c5a 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -722,11 +722,29 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter 
*adapter)
struct e1000_hw *hw = >hw;
struct skb_shared_hwtstamps shhwtstamps;
u64 regval;
+   int adjust = 0;
 
regval = rd32(E1000_TXSTMPL);
regval |= (u64)rd32(E1000_TXSTMPH) << 32;
 
igb_ptp_systim_to_hwtstamp(adapter, , regval);
+   /* adjust timestamp for the TX latency based on link speed */
+   if (adapter->hw.mac.type == e1000_i210) {
+   switch (adapter->link_speed) {
+   case SPEED_10:
+   adjust = IGB_I210_TX_LATENCY_10;
+   break;
+   case SPEED_100:
+   adjust = IGB_I210_TX_LATENCY_100;
+   break;
+   case SPEED_1000:
+   adjust = IGB_I210_TX_LATENCY_1000;
+   break;
+   }
+   }
+
+   shhwtstamps.hwtstamp = ktime_sub_ns(shhwtstamps.hwtstamp, adjust);
+
skb_tstamp_tx(adapter->ptp_tx_skb, );
dev_kfree_skb_any(adapter->ptp_tx_skb);
adapter->ptp_tx_skb = NULL;
@@ -771,6 +789,7 @@ void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
struct igb_adapter *adapter = q_vector->adapter;
struct e1000_hw *hw = >hw;
u64 regval;
+   int adjust = 0;
 
/* If this bit is set, then the RX registers contain the time stamp. No
 * other packet will be time stamped until we read these registers, so
@@ -790,6 +809,23 @@ void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
 
igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), regval);
 
+   /* adjust timestamp for the RX latency based on link speed */
+   if (adapter->hw.mac.type == e1000_i210) {
+   switch (adapter->link_speed) {
+   case SPEED_10:
+   adjust = IGB_I210_RX_LATENCY_10;
+   break;
+   case SPEED_100:
+   adjust = IGB_I210_RX_LATENCY_100;
+   break;
+   case SPEED_1000:
+   adjust = IGB_I210_RX_LATENCY_1000;
+   break;
+   }
+   }
+   skb_hwtstamps(skb)->hwtstamp =
+   ktime_add_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
+
/* Update the last_rx_timestamp timer in order to enable watchdog check
 * for error case of latched timestamp on a dropped packet.
 */
-- 
2.5.5

[net-next 13/14] igb/igbvf: Add support for GSO partial

2016-05-13 Thread Jeff Kirsher

From: Alexander Duyck 

This patch adds support for partial GSO segmentation in the case of
tunnels.  Specifically with this change the driver an perform segmentation
as long as the frame either has IPv6 inner headers, or we are allowed to
mangle the IP IDs on the inner header.  This is needed because we will not
be modifying any fields from the start of the start of the outer transport
header to the start of the inner transport header as we are treating them
like they are just a block of IP options.

Signed-off-by: Alexander Duyck 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 137 +++---
 drivers/net/ethernet/intel/igbvf/netdev.c | 182 ++
 2 files changed, 203 insertions(+), 116 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index cab3069..2172769 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2087,6 +2087,40 @@ static int igb_ndo_fdb_add(struct ndmsg *ndm, struct 
nlattr *tb[],
return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
+#define IGB_MAX_MAC_HDR_LEN127
+#define IGB_MAX_NETWORK_HDR_LEN511
+
+static netdev_features_t
+igb_features_check(struct sk_buff *skb, struct net_device *dev,
+  netdev_features_t features)
+{
+   unsigned int network_hdr_len, mac_hdr_len;
+
+   /* Make certain the headers can be described by a context descriptor */
+   mac_hdr_len = skb_network_header(skb) - skb->data;
+   if (unlikely(mac_hdr_len > IGB_MAX_MAC_HDR_LEN))
+   return features & ~(NETIF_F_HW_CSUM |
+   NETIF_F_SCTP_CRC |
+   NETIF_F_HW_VLAN_CTAG_TX |
+   NETIF_F_TSO |
+   NETIF_F_TSO6);
+
+   network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+   if (unlikely(network_hdr_len >  IGB_MAX_NETWORK_HDR_LEN))
+   return features & ~(NETIF_F_HW_CSUM |
+   NETIF_F_SCTP_CRC |
+   NETIF_F_TSO |
+   NETIF_F_TSO6);
+
+   /* We can only support IPV4 TSO in tunnels if we can mangle the
+* inner IP ID field, so strip TSO if MANGLEID is not supported.
+*/
+   if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
+   features &= ~NETIF_F_TSO;
+
+   return features;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
.ndo_open   = igb_open,
.ndo_stop   = igb_close,
@@ -2111,7 +2145,7 @@ static const struct net_device_ops igb_netdev_ops = {
.ndo_fix_features   = igb_fix_features,
.ndo_set_features   = igb_set_features,
.ndo_fdb_add= igb_ndo_fdb_add,
-   .ndo_features_check = passthru_features_check,
+   .ndo_features_check = igb_features_check,
 };
 
 /**
@@ -2377,38 +2411,43 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
NETIF_F_TSO6 |
NETIF_F_RXHASH |
NETIF_F_RXCSUM |
-   NETIF_F_HW_CSUM |
-   NETIF_F_HW_VLAN_CTAG_RX |
-   NETIF_F_HW_VLAN_CTAG_TX;
+   NETIF_F_HW_CSUM;
 
if (hw->mac.type >= e1000_82576)
netdev->features |= NETIF_F_SCTP_CRC;
 
+#define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+ NETIF_F_GSO_GRE_CSUM | \
+ NETIF_F_GSO_IPIP | \
+ NETIF_F_GSO_SIT | \
+ NETIF_F_GSO_UDP_TUNNEL | \
+ NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+   netdev->gso_partial_features = IGB_GSO_PARTIAL_FEATURES;
+   netdev->features |= NETIF_F_GSO_PARTIAL | IGB_GSO_PARTIAL_FEATURES;
+
/* copy netdev features into list of user selectable features */
-   netdev->hw_features |= netdev->features;
-   netdev->hw_features |= NETIF_F_RXALL;
+   netdev->hw_features |= netdev->features |
+  NETIF_F_HW_VLAN_CTAG_RX |
+  NETIF_F_HW_VLAN_CTAG_TX |
+  NETIF_F_RXALL;
 
if (hw->mac.type >= e1000_i350)
netdev->hw_features |= NETIF_F_NTUPLE;
 
-   /* set this bit last since it cannot be part of hw_features */
-   netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-
-   netdev->vlan_features |= NETIF_F_SG |
-NETIF_F_TSO |
-NETIF_F_TSO6 |
-

[net-next 14/14] e1000e: don't modify SYSTIM registers during SIOCSHWTSTAMP ioctl

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

The e1000e_config_hwtstamp function was incorrectly resetting the SYSTIM
registers every time the ioctl was being run. If you happened to be
running ptp4l and lost the PTP connect (removing cable, or blocking the
UDP traffic for example), then ptp4l will eventually perform a restart
which involves re-requesting timestamp settings. In e1000e this has the
unfortunate and incorrect result of resetting SYSTIME to the kernel
time. Since kernel time is usually in UTC, and PTP time is in TAI, this
results in the leap second being re-applied.

Fix this by extracting the SYSTIME reset out into its own function,
e1000e_ptp_reset, which we call during reset to restore the hardware
registers. This function will (a) restart the timecounter based on the
new system time, (b) restore the previous PPB setting, and (c) restore
the previous hwtstamp settings.

In order to perform (b), I had to modify the adjfreq ptp function
pointer to store the old delta each time it is called. This also has the
side effect of restoring the correct base timinca register correctly.
The driver does not need to explicitly zero the ptp_delta variable since
the entire adapter structure comes zero-initialized.

Reported-by: Brian Walsh 
Signed-off-by: Jacob Keller 
Tested-by: Brian Walsh 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/e1000.h  |  1 +
 drivers/net/ethernet/intel/e1000e/netdev.c | 68 +++---
 drivers/net/ethernet/intel/e1000e/ptp.c|  2 +
 3 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h 
b/drivers/net/ethernet/intel/e1000e/e1000.h
index 010e6d6..ef96cd1 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -347,6 +347,7 @@ struct e1000_adapter {
struct ptp_clock *ptp_clock;
struct ptp_clock_info ptp_clock_info;
struct pm_qos_request pm_qos_req;
+   s32 ptp_delta;
 
u16 eee_advert;
 };
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index c597398..75e6089 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3580,7 +3580,6 @@ static int e1000e_config_hwtstamp(struct e1000_adapter 
*adapter,
bool is_l4 = false;
bool is_l2 = false;
u32 regval;
-   s32 ret_val;
 
if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP))
return -EINVAL;
@@ -3719,16 +3718,6 @@ static int e1000e_config_hwtstamp(struct e1000_adapter 
*adapter,
er32(RXSTMPH);
er32(TXSTMPH);
 
-   /* Get and set the System Time Register SYSTIM base frequency */
-   ret_val = e1000e_get_base_timinca(adapter, );
-   if (ret_val)
-   return ret_val;
-   ew32(TIMINCA, regval);
-
-   /* reset the ns time counter */
-   timecounter_init(>tc, >cc,
-ktime_to_ns(ktime_get_real()));
-
return 0;
 }
 
@@ -3885,6 +3874,53 @@ static void e1000_flush_desc_rings(struct e1000_adapter 
*adapter)
 }
 
 /**
+ * e1000e_systim_reset - reset the timesync registers after a hardware reset
+ * @adapter: board private structure
+ *
+ * When the MAC is reset, all hardware bits for timesync will be reset to the
+ * default values. This function will restore the settings last in place.
+ * Since the clock SYSTIME registers are reset, we will simply restore the
+ * cyclecounter to the kernel real clock time.
+ **/
+static void e1000e_systim_reset(struct e1000_adapter *adapter)
+{
+   struct ptp_clock_info *info = >ptp_clock_info;
+   struct e1000_hw *hw = >hw;
+   unsigned long flags;
+   u32 timinca;
+   s32 ret_val;
+
+   if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP))
+   return;
+
+   if (info->adjfreq) {
+   /* restore the previous ptp frequency delta */
+   ret_val = info->adjfreq(info, adapter->ptp_delta);
+   } else {
+   /* set the default base frequency if no adjustment possible */
+   ret_val = e1000e_get_base_timinca(adapter, );
+   if (!ret_val)
+   ew32(TIMINCA, timinca);
+   }
+
+   if (ret_val) {
+   dev_warn(>pdev->dev,
+"Failed to restore TIMINCA clock rate delta: %d\n",
+ret_val);
+   return;
+   }
+
+   /* reset the systim ns time counter */
+   spin_lock_irqsave(>systim_lock, flags);
+   timecounter_init(>tc, >cc,
+ktime_to_ns(ktime_get_real()));
+   spin_unlock_irqrestore(>systim_lock, flags);
+
+   /* restore the previous hwtstamp configuration settings */
+   e1000e_config_hwtstamp(adapter, >hwtstamp_config);
+}
+
+/**
  *

[net-next 12/14] e1000e: mark shifted values as unsigned

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

The E1000_ICH_NVM_SIG_MASK value is shifted, out to the 31st bit, which
is the signed bit for signed constants. Mark these values as unsigned to
prevent compiler warnings and issues on platforms which a different
signed bit implementation.

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/ich8lan.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h 
b/drivers/net/ethernet/intel/e1000e/ich8lan.h
index 2311f60..67163ca 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.h
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h
@@ -73,10 +73,10 @@
 (ID_LED_OFF1_ON2  <<  4) | \
 (ID_LED_DEF1_DEF2))
 
-#define E1000_ICH_NVM_SIG_WORD 0x13
-#define E1000_ICH_NVM_SIG_MASK 0xC000
-#define E1000_ICH_NVM_VALID_SIG_MASK   0xC0
-#define E1000_ICH_NVM_SIG_VALUE0x80
+#define E1000_ICH_NVM_SIG_WORD 0x13u
+#define E1000_ICH_NVM_SIG_MASK 0xC000u
+#define E1000_ICH_NVM_VALID_SIG_MASK   0xC0u
+#define E1000_ICH_NVM_SIG_VALUE0x80u
 
 #define E1000_ICH8_LAN_INIT_TIMEOUT1500
 
-- 
2.5.5

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Andrew Lunn

On Fri, May 13, 2016 at 11:56:12PM +0300, Sergei Shtylyov wrote:
> On 05/13/2016 11:44 PM, Andrew Lunn wrote:
> 
> >>>Another issue is that on some boards we have one reset line tied to
> >>>multiple PHYs.How do we prevent multiple resets being taking place when 
> >>>each of
> >>>the PHYs are registered?
> >>
> >>   My patch just doesn't address this case -- it's about the
> >>individual resets only.
> >
> >This actually needs to be addresses a layer above. What you have is a
> >bus reset, not a device reset.
> 
>No.
>There's simply no such thing as a bus reset for the xMII/MDIO
> busses, there's simply no reset signaling on them. Every device has
> its own reset signal and its own timing requirements.

Except in the case above, where two phys are sharing the same reset
signal. So although it is not part of the mdio standard to have a bus
reset, this is in effect what the gpio line is doing, resetting all
devices on the bus. If you don't model that as a bus reset, how do you
model it?

  Andrew

[net-next 11/14] e1000e: use BIT() macro for bit defines

2016-05-13 Thread Jeff Kirsher

From: Jacob Keller 

This prevents signed bitshift issues when the shift would overwrite the
signed bit, and prevents making this mistake in the future when copying
and modifying code.

Use GENMASK or the unsigned postfix for cases which aren't suitable for
BIT() macro.

Signed-off-by: Jacob Keller 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/80003es2lan.c |  12 +--
 drivers/net/ethernet/intel/e1000e/82571.c   |  30 +++
 drivers/net/ethernet/intel/e1000e/e1000.h   | 108 
 drivers/net/ethernet/intel/e1000e/ethtool.c |  45 +-
 drivers/net/ethernet/intel/e1000e/ich8lan.c |  44 +-
 drivers/net/ethernet/intel/e1000e/mac.c |   2 +-
 drivers/net/ethernet/intel/e1000e/netdev.c  |  28 +++---
 drivers/net/ethernet/intel/e1000e/nvm.c |   2 +-
 drivers/net/ethernet/intel/e1000e/phy.c |   4 +-
 drivers/net/ethernet/intel/e1000e/phy.h |  10 +--
 10 files changed, 143 insertions(+), 142 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/80003es2lan.c 
b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
index 2af603f..cd39137 100644
--- a/drivers/net/ethernet/intel/e1000e/80003es2lan.c
+++ b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
@@ -121,7 +121,7 @@ static s32 e1000_init_nvm_params_80003es2lan(struct 
e1000_hw *hw)
/* EEPROM access above 16k is unsupported */
if (size > 14)
size = 14;
-   nvm->word_size = 1 << size;
+   nvm->word_size = BIT(size);
 
return 0;
 }
@@ -845,27 +845,27 @@ static void e1000_initialize_hw_bits_80003es2lan(struct 
e1000_hw *hw)
 
/* Transmit Descriptor Control 0 */
reg = er32(TXDCTL(0));
-   reg |= (1 << 22);
+   reg |= BIT(22);
ew32(TXDCTL(0), reg);
 
/* Transmit Descriptor Control 1 */
reg = er32(TXDCTL(1));
-   reg |= (1 << 22);
+   reg |= BIT(22);
ew32(TXDCTL(1), reg);
 
/* Transmit Arbitration Control 0 */
reg = er32(TARC(0));
reg &= ~(0xF << 27);/* 30:27 */
if (hw->phy.media_type != e1000_media_type_copper)
-   reg &= ~(1 << 20);
+   reg &= ~BIT(20);
ew32(TARC(0), reg);
 
/* Transmit Arbitration Control 1 */
reg = er32(TARC(1));
if (er32(TCTL) & E1000_TCTL_MULR)
-   reg &= ~(1 << 28);
+   reg &= ~BIT(28);
else
-   reg |= (1 << 28);
+   reg |= BIT(28);
ew32(TARC(1), reg);
 
/* Disable IPv6 extension header parsing because some malformed
diff --git a/drivers/net/ethernet/intel/e1000e/82571.c 
b/drivers/net/ethernet/intel/e1000e/82571.c
index 5f70164..7fd4d54 100644
--- a/drivers/net/ethernet/intel/e1000e/82571.c
+++ b/drivers/net/ethernet/intel/e1000e/82571.c
@@ -185,7 +185,7 @@ static s32 e1000_init_nvm_params_82571(struct e1000_hw *hw)
/* EEPROM access above 16k is unsupported */
if (size > 14)
size = 14;
-   nvm->word_size = 1 << size;
+   nvm->word_size = BIT(size);
break;
}
 
@@ -1163,12 +1163,12 @@ static void e1000_initialize_hw_bits_82571(struct 
e1000_hw *hw)
 
/* Transmit Descriptor Control 0 */
reg = er32(TXDCTL(0));
-   reg |= (1 << 22);
+   reg |= BIT(22);
ew32(TXDCTL(0), reg);
 
/* Transmit Descriptor Control 1 */
reg = er32(TXDCTL(1));
-   reg |= (1 << 22);
+   reg |= BIT(22);
ew32(TXDCTL(1), reg);
 
/* Transmit Arbitration Control 0 */
@@ -1177,11 +1177,11 @@ static void e1000_initialize_hw_bits_82571(struct 
e1000_hw *hw)
switch (hw->mac.type) {
case e1000_82571:
case e1000_82572:
-   reg |= (1 << 23) | (1 << 24) | (1 << 25) | (1 << 26);
+   reg |= BIT(23) | BIT(24) | BIT(25) | BIT(26);
break;
case e1000_82574:
case e1000_82583:
-   reg |= (1 << 26);
+   reg |= BIT(26);
break;
default:
break;
@@ -1193,12 +1193,12 @@ static void e1000_initialize_hw_bits_82571(struct 
e1000_hw *hw)
switch (hw->mac.type) {
case e1000_82571:
case e1000_82572:
-   reg &= ~((1 << 29) | (1 << 30));
-   reg |= (1 << 22) | (1 << 24) | (1 << 25) | (1 << 26);
+   reg &= ~(BIT(29) | BIT(30));
+   reg |= BIT(22) | BIT(24) | BIT(25) | BIT(26);
if (er32(TCTL) & E1000_TCTL_MULR)
-   reg &= ~(1 << 28);
+   reg &= ~BIT(28);
else
-   reg |= (1 << 28);
+   reg |= BIT(28);
ew32(TARC(1), reg);
break;
default:
@@ -1211,7 +1211,7 @@ static void e1000_initialize_hw_bits_82571(struct 
e1000_hw *hw)

[net-next 00/14][pull request] 1GbE Intel Wired LAN Driver Updates 2016-05-13

2016-05-13 Thread Jeff Kirsher

This series contains updates to e1000e, igb and igbvf.

Steve Shih fixes an issue for disabling auto-negotiation and forcing
speed and duplex settings for non-copper media.

Brian Walsh cleanups some inconsistency in the use of return variables
names to avoid confusion.

Jake cleans up the drivers to use the BIT() macro when it can, which will
future proof the drivers for GCC 6 when it gets released.  Cleaned up
dead code which was never being used.  Also fixed e1000e, where it was
incorrectly restting the SYSTIM registers every time the ioctl was being
run.

Denys Vlasenko fixes an oversight where incvalue variable holds a 32
bit value so we should declare it as such, instead of 64 bits.  Also
fixed an overflow check, where two reads are the same, then it is not
an overflow.

Nathan Sullivan fixes the PTP timestamps for transmit and receive
latency based on the current link speed.

Alexander Duyck adds support for partial GSO segmentation in the case
of tunnels for igb and igbvf.

The following are changes since commit ed7cbbce544856b20e5811de373cf92e92499771:
  udp: Resolve NULL pointer dereference over flow-based vxlan device
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 1GbE

Alexander Duyck (1):
  igb/igbvf: Add support for GSO partial

Brian Walsh (1):
  e1000e: Cleanup consistency in ret_val variable usage

Denys Vlasenko (3):
  e1000e: e1000e_cyclecounter_read(): incvalue is 32 bits, not 64
  e1000e: e1000e_cyclecounter_read(): fix er32(SYSTIML) overflow check
  e1000e: e1000e_cyclecounter_read(): do overflow check only if needed

Jacob Keller (7):
  igb: use BIT() macro or unsigned prefix
  igb: make igb_update_pf_vlvf static
  igbvf: remove unused variable and dead code
  igbvf: use BIT() macro instead of shifts
  e1000e: use BIT() macro for bit defines
  e1000e: mark shifted values as unsigned
  e1000e: don't modify SYSTIM registers during SIOCSHWTSTAMP ioctl

Nathan Sullivan (1):
  igb: adjust PTP timestamps for Tx/Rx latency

Steve Shih (1):
  e1000e: fix ethtool autoneg off for non-copper

 drivers/net/ethernet/intel/e1000e/80003es2lan.c |  12 +-
 drivers/net/ethernet/intel/e1000e/82571.c   |  30 ++--
 drivers/net/ethernet/intel/e1000e/e1000.h   | 109 ++---
 drivers/net/ethernet/intel/e1000e/ethtool.c |  57 ---
 drivers/net/ethernet/intel/e1000e/ich8lan.c |  44 +++---
 drivers/net/ethernet/intel/e1000e/ich8lan.h |   8 +-
 drivers/net/ethernet/intel/e1000e/mac.c |   2 +-
 drivers/net/ethernet/intel/e1000e/netdev.c  | 149 +++---
 drivers/net/ethernet/intel/e1000e/nvm.c |   2 +-
 drivers/net/ethernet/intel/e1000e/phy.c |   4 +-
 drivers/net/ethernet/intel/e1000e/phy.h |  10 +-
 drivers/net/ethernet/intel/e1000e/ptp.c |   2 +
 drivers/net/ethernet/intel/igb/e1000_82575.c|   8 +-
 drivers/net/ethernet/intel/igb/e1000_82575.h|  30 ++--
 drivers/net/ethernet/intel/igb/e1000_defines.h  | 108 ++---
 drivers/net/ethernet/intel/igb/e1000_mac.c  |  10 +-
 drivers/net/ethernet/intel/igb/e1000_mbx.c  |   4 +-
 drivers/net/ethernet/intel/igb/e1000_nvm.c  |   2 +-
 drivers/net/ethernet/intel/igb/e1000_phy.h  |   6 +-
 drivers/net/ethernet/intel/igb/igb.h|  40 +++--
 drivers/net/ethernet/intel/igb/igb_ethtool.c|  18 +--
 drivers/net/ethernet/intel/igb/igb_main.c   | 187 ++
 drivers/net/ethernet/intel/igb/igb_ptp.c|  42 -
 drivers/net/ethernet/intel/igbvf/defines.h  |   2 +-
 drivers/net/ethernet/intel/igbvf/ethtool.c  |   3 +-
 drivers/net/ethernet/intel/igbvf/igbvf.h|   4 +-
 drivers/net/ethernet/intel/igbvf/netdev.c   | 196 ++--
 drivers/net/ethernet/intel/igbvf/vf.c   |   2 +-
 28 files changed, 635 insertions(+), 456 deletions(-)

-- 
2.5.5

[net-next 02/14] e1000e: Cleanup consistency in ret_val variable usage

2016-05-13 Thread Jeff Kirsher

From: Brian Walsh 

Fixed the file to use a consistent ret_val for return value checking.

Signed-off-by: Brian Walsh 
Tested-by: Aaron Brown 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 269087c..671256d 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3368,12 +3368,12 @@ static int e1000e_write_uc_addr_list(struct net_device 
*netdev)
 * combining
 */
netdev_for_each_uc_addr(ha, netdev) {
-   int rval;
+   int ret_val;
 
if (!rar_entries)
break;
-   rval = hw->mac.ops.rar_set(hw, ha->addr, rar_entries--);
-   if (rval < 0)
+   ret_val = hw->mac.ops.rar_set(hw, ha->addr, 
rar_entries--);
+   if (ret_val < 0)
return -ENOMEM;
count++;
}
@@ -6965,7 +6965,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
int bars, i, err, pci_using_dac;
u16 eeprom_data = 0;
u16 eeprom_apme_mask = E1000_EEPROM_APME;
-   s32 rval = 0;
+   s32 ret_val = 0;
 
if (ei->flags2 & FLAG2_DISABLE_ASPM_L0S)
aspm_disable_flag = PCIE_LINK_STATE_L0S;
@@ -7200,18 +7200,18 @@ static int e1000_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
} else if (adapter->flags & FLAG_APME_IN_CTRL3) {
if (adapter->flags & FLAG_APME_CHECK_PORT_B &&
(adapter->hw.bus.func == 1))
-   rval = e1000_read_nvm(>hw,
+   ret_val = e1000_read_nvm(>hw,
  NVM_INIT_CONTROL3_PORT_B,
  1, _data);
else
-   rval = e1000_read_nvm(>hw,
+   ret_val = e1000_read_nvm(>hw,
  NVM_INIT_CONTROL3_PORT_A,
  1, _data);
}
 
/* fetch WoL from EEPROM */
-   if (rval)
-   e_dbg("NVM read error getting WoL initial values: %d\n", rval);
+   if (ret_val)
+   e_dbg("NVM read error getting WoL initial values: %d\n", 
ret_val);
else if (eeprom_data & eeprom_apme_mask)
adapter->eeprom_wol |= E1000_WUFC_MAG;
 
@@ -7231,10 +7231,10 @@ static int e1000_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
device_wakeup_enable(>dev);
 
/* save off EEPROM version number */
-   rval = e1000_read_nvm(>hw, 5, 1, >eeprom_vers);
+   ret_val = e1000_read_nvm(>hw, 5, 1, >eeprom_vers);
 
-   if (rval) {
-   e_dbg("NVM read error getting EEPROM version: %d\n", rval);
+   if (ret_val) {
+   e_dbg("NVM read error getting EEPROM version: %d\n", ret_val);
adapter->eeprom_vers = 0;
}
 
-- 
2.5.5

What ixgbe devices support HWTSTAMP_FILTER_ALL for hardware time stamping?

2016-05-13 Thread Guy Harris

libpcap offers the ability to request hardware time stamping for packets and to 
inquire which forms of hardware time stamping, if any, are supported for an 
interface.

The Linux implementation currently implements the inquiry by doing a 
ETHTOOL_GET_TS_INFO SIOETHTOOL ioctl and looking at the so_timestamping bits, 
if the linux/ethtool.h header defines ETHTOOL_GET_TS_INFO and the ioctl 
succeeds on the device.

This is inadequate - as libpcap requests hardware time stamping for all 
packets, it should also check whether HWTSTAMP_FILTER_ALL is set in rx_filters, 
and only offer hardware time stamping if it's set.

The code in ixgbe_ptp.c does:

case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
case HWTSTAMP_FILTER_ALL:
/* The X550 controller is capable of timestamping all packets,
 * which allows it to accept any filter.
 */
if (hw->mac.type >= ixgbe_mac_X550) {
tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_ALL;
config->rx_filter = HWTSTAMP_FILTER_ALL;
adapter->flags |= IXGBE_FLAG_RX_HWTSTAMP_ENABLED;
break;
}
/* fall through */
default:
/*
 * register RXMTRL must be set in order to do V1 packets,
 * therefore it is not possible to time stamp both V1 Sync and
 * Delay_Req messages and hardware does not support
 * timestamping all packets => return error
 */
adapter->flags &= ~(IXGBE_FLAG_RX_HWTSTAMP_ENABLED |
IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER);
config->rx_filter = HWTSTAMP_FILTER_NONE;
return -ERANGE;

which seems to indicate that only the X550 controller supports time stamping 
all packets in hardware.

However, the code in ixgbe_ethtool.c does:

switch (adapter->hw.mac.type) {
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
case ixgbe_mac_X540:
case ixgbe_mac_82599EB:
info->so_timestamping =
SOF_TIMESTAMPING_TX_SOFTWARE |
SOF_TIMESTAMPING_RX_SOFTWARE |
SOF_TIMESTAMPING_SOFTWARE |
SOF_TIMESTAMPING_TX_HARDWARE |
SOF_TIMESTAMPING_RX_HARDWARE |
SOF_TIMESTAMPING_RAW_HARDWARE;

if (adapter->ptp_clock)
info->phc_index = ptp_clock_index(adapter->ptp_clock);
else
info->phc_index = -1;

info->tx_types =
(1 << HWTSTAMP_TX_OFF) |
(1 << HWTSTAMP_TX_ON);

info->rx_filters =
(1 << HWTSTAMP_FILTER_NONE) |
(1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
(1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
(1 << HWTSTAMP_FILTER_PTP_V2_EVENT);
break;
default:
return ethtool_op_get_ts_info(dev, info);
}

which draws no distinction between the X550 controller and the X540 and 82599, 
and doesn't say *any* of them support HWTSTAMP_FILTER_ALL.

Is it the case that only the ixgbe_mac_X550 and ixgbe_mac_X550EM_x controllers 
support HWTSTAMP_FILTER_ALL?  If so, shouldn't ixgbe_get_ts_info() be doing 
something such as:

switch (adapter->hw.mac.type) {
case ixgbe_mac_X550:
case ixgbe_mac_X550EM_x:
case ixgbe_mac_X540:
case ixgbe_mac_82599EB:
info->so_timestamping =
SOF_TIMESTAMPING_TX_SOFTWARE |
SOF_TIMESTAMPING_RX_SOFTWARE |
SOF_TIMESTAMPING_SOFTWARE |
SOF_TIMESTAMPING_TX_HARDWARE |
SOF_TIMESTAMPING_RX_HARDWARE |
SOF_TIMESTAMPING_RAW_HARDWARE;

if (adapter->ptp_clock)
info->phc_index = ptp_clock_index(adapter->ptp_clock);
else
info->phc_index = -1;

info->tx_types =
(1 << HWTSTAMP_TX_OFF) |
(1 << HWTSTAMP_TX_ON);

info->rx_filters =
(1 << HWTSTAMP_FILTER_NONE) |
(1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
(1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
(1 << HWTSTAMP_FILTER_PTP_V2_EVENT);
if (adapter->hw.mac.type >= ixgbe_mac_X550)
info->rx_filters |= (1 << HWTSTAMP_FILTER_ALL);
break;
default:
return ethtool_op_get_ts_info(dev, info);
}

>From a quick scan of drivers/net, it looks as if

drivers/net/ethernet/cavium/liquidio

also support

[PATCH 2/2] lxt: simplify lxt970_config_init()

2016-05-13 Thread Sergei Shtylyov

This function declares the 'err' local variable for no good reason, get rid
of it.

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/phy/lxt.c |6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

Index: net-next/drivers/net/phy/lxt.c
===
--- net-next.orig/drivers/net/phy/lxt.c
+++ net-next/drivers/net/phy/lxt.c
@@ -88,11 +88,7 @@ static int lxt970_config_intr(struct phy
 
 static int lxt970_config_init(struct phy_device *phydev)
 {
-   int err;
-
-   err = phy_write(phydev, MII_LXT970_CONFIG, 0);
-
-   return err;
+   return phy_write(phydev, MII_LXT970_CONFIG, 0);
 }

[PATCH 1/2] lxt: simplify lxt97[01]_config_intr()

2016-05-13 Thread Sergei Shtylyov

Both these functions declare the 'err' local variables for no good reason,
get rid of them.

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/phy/lxt.c |   16 
 1 file changed, 4 insertions(+), 12 deletions(-)

Index: net-next/drivers/net/phy/lxt.c
===
--- net-next.orig/drivers/net/phy/lxt.c
+++ net-next/drivers/net/phy/lxt.c
@@ -80,14 +80,10 @@ static int lxt970_ack_interrupt(struct p
 
 static int lxt970_config_intr(struct phy_device *phydev)
 {
-   int err;
-
if (phydev->interrupts == PHY_INTERRUPT_ENABLED)
-   err = phy_write(phydev, MII_LXT970_IER, MII_LXT970_IER_IEN);
+   return phy_write(phydev, MII_LXT970_IER, MII_LXT970_IER_IEN);
else
-   err = phy_write(phydev, MII_LXT970_IER, 0);
-
-   return err;
+   return phy_write(phydev, MII_LXT970_IER, 0);
 }
 
 static int lxt970_config_init(struct phy_device *phydev)
@@ -112,14 +108,10 @@ static int lxt971_ack_interrupt(struct p
 
 static int lxt971_config_intr(struct phy_device *phydev)
 {
-   int err;
-
if (phydev->interrupts == PHY_INTERRUPT_ENABLED)
-   err = phy_write(phydev, MII_LXT971_IER, MII_LXT971_IER_IEN);
+   return phy_write(phydev, MII_LXT971_IER, MII_LXT971_IER_IEN);
else
-   err = phy_write(phydev, MII_LXT971_IER, 0);
-
-   return err;
+   return phy_write(phydev, MII_LXT971_IER, 0);
 }
 
 /*

[PATCH 0/2] Kill unneeded local variables in the LXT PHY driver

2016-05-13 Thread Sergei Shtylyov

Hello.

[Resending with the correct subject. Sorry about that :-/]

   Here's the set of 2 patches against DaveM's 'net-next.git' repo. We save
several LoCs on the unneeded local variables

[1/2] lxt: simplify lxt97[01]_config_intr()
[2/2] lxt: simplify lxt970_config_init()

MBR, Sergei

[PATCH 0/2] Don't return NULL from get_phy_device() anymore

2016-05-13 Thread Sergei Shtylyov

Hello.

   Here's the set of 2 patches against DaveM's 'net-next.git' repo. We save
several LoCs on the unneeded local variables

[1/2] lxt: simplify lxt97[01]_config_intr()
[2/2] lxt: simplify lxt970_config_init()

MBR, Sergei

Re: QRTR merge conflict resolution (was: Re: linux-next: build failure after merge of the net-next tree)

2016-05-13 Thread Andy Gross

On 13 May 2016 at 17:19, Bjorn Andersson  wrote:
> On Fri 13 May 14:01 PDT 2016, Arnd Bergmann wrote:
>
>> On Tuesday 10 May 2016 11:39:34 Bjorn Andersson wrote:
> [..]
>> > I assume we could have the QRTR go through Andy and arm-soc, with
>> > David's approval and this fix squashed in. But we're running rather late
>> > in this cycle, perhaps we should just back the QRTR patches out and I
>> > can respin and resend them after the merge window (for v4.8 instead)?
>>
>> I'd suggest you do a merge of next-next with the qcom/soc-2 branch that
>> we have in arm-soc and resolve the conflict in the merge, then send
>> a pull request with the merge to davem.
>>
>
> Hi David,
>
> In case you missed this thread, linux-next highlighted an upcoming merge
> conflict between the net-next and one of the branches included in the
> arm-soc trees.
>
> I have prepared the merge of net-next and the conflicting tag from the
> Qualcomm SOC, please include this in your pull towards Linus to avoid
> the merge conflict.
>
> Regards,
> Bjorn
>
> The following changes since commit ed7cbbce544856b20e5811de373cf92e92499771:
>
>   udp: Resolve NULL pointer dereference over flow-based vxlan device 
> (2016-05-13 01:56:14 -0400)


OK. The contents look good to me.

Acked-by: Andy Gross

QRTR merge conflict resolution (was: Re: linux-next: build failure after merge of the net-next tree)

2016-05-13 Thread Bjorn Andersson

On Fri 13 May 14:01 PDT 2016, Arnd Bergmann wrote:

> On Tuesday 10 May 2016 11:39:34 Bjorn Andersson wrote:
[..]
> > I assume we could have the QRTR go through Andy and arm-soc, with
> > David's approval and this fix squashed in. But we're running rather late
> > in this cycle, perhaps we should just back the QRTR patches out and I
> > can respin and resend them after the merge window (for v4.8 instead)?
> 
> I'd suggest you do a merge of next-next with the qcom/soc-2 branch that
> we have in arm-soc and resolve the conflict in the merge, then send
> a pull request with the merge to davem.
> 

Hi David,

In case you missed this thread, linux-next highlighted an upcoming merge
conflict between the net-next and one of the branches included in the
arm-soc trees.

I have prepared the merge of net-next and the conflicting tag from the
Qualcomm SOC, please include this in your pull towards Linus to avoid
the merge conflict.

Regards,
Bjorn

The following changes since commit ed7cbbce544856b20e5811de373cf92e92499771:

  udp: Resolve NULL pointer dereference over flow-based vxlan device 
(2016-05-13 01:56:14 -0400)

are available in the git repository at:

  git://github.com/andersson/kernel tags/net-next-qcom-soc-4.7-2-merge

for you to fetch changes up to f79a917e69e1f5cd86e864b67f06147f1b0340f4:

  Merge tag 'qcom-soc-for-4.7-2' into net-next (2016-05-13 14:42:23 -0700)


Merge tag 'qcom-soc-for-4.7-2' into net-next

This merges the Qualcomm SOC tree with the net-next, solving the
merge conflict in the SMD API between the two.


Andy Gross (1):
  Merge tag 'qcom-soc-for-4.7' into soc-for-4.7-p2

Bjorn Andersson (9):
  soc: qcom: smem_state: Add stubs for disabled smem_state
  soc: qcom: smd: Introduce callback setter
  soc: qcom: smd: Split discovery and state change work
  soc: qcom: smd: Refactor channel open and close handling
  soc: qcom: smd: Support multiple channels per sdev
  soc: qcom: smd: Support opening additional channels
  soc: qcom: smem: Use write-combine remap for SMEM
  soc: qcom: smd: Make callback pass channel reference
  Merge tag 'qcom-soc-for-4.7-2' into net-next

Lina Iyer (1):
  drivers: qcom: spm: avoid module usage in non-modular SPM driver

Srinivas Kandagatla (2):
  MAINTAINERS: add qcom i2c and spi drivers to list
  MAINTAINERS: add qcom clocks to the maintainers list

 MAINTAINERS |   3 +
 drivers/soc/qcom/smd-rpm.c  |   9 +-
 drivers/soc/qcom/smd.c  | 247 +++-
 drivers/soc/qcom/smem.c |   3 +-
 drivers/soc/qcom/spm.c  |   8 +-
 drivers/soc/qcom/wcnss_ctrl.c   |   8 +-
 include/linux/soc/qcom/smd.h|  33 -
 include/linux/soc/qcom/smem_state.h |  35 +
 net/qrtr/smd.c  |   9 +-
 9 files changed, 278 insertions(+), 77 deletions(-)

> Alternatively, in case Linus merges net-next before we get that fix
> in, I could send Stephen's fix to Linus along with the pull requests.
> 
>   Arnd

Re: [RESEND PATCH v7 17/21] IB/hns: Add QP operations support

2016-05-13 Thread Doug Ledford

On 05/09/2016 11:04 PM, Lijun Ou wrote:
> +static void hns_roce_v1_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
> +  struct hns_roce_srq *srq)
> +{
> + spin_lock_irq(_cq->lock);
> + hns_roce_v1_clean_cq(hr_cq, qpn, srq);
> + spin_unlock_irq(_cq->lock);
> +}

This is a perfect example of what I was talking about in my last email.
The convention here would be to name the main function
__hns_roce_v1_cq_clean and the wrapper hns_roce_v1_cq_clean.  Instead,
you have one named cq_clean and one named clean_cq.  Keeping straight
which of those locks itself and which needs to be called with the lock
held is nigh impossible.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD

signature.asc
Description: OpenPGP digital signature

Re: [RESEND PATCH v7 17/21] IB/hns: Add QP operations support

2016-05-13 Thread Doug Ledford

On 05/09/2016 11:04 PM, Lijun Ou wrote:
> +int __hns_roce_cmd(struct hns_roce_dev *hr_dev, u64 in_param, u64 *out_param,
> +unsigned long in_modifier, u8 op_modifier, u16 op,
> +unsigned long timeout);
> +
> +/* Invoke a command with no output parameter */
> +static inline int hns_roce_cmd(struct hns_roce_dev *hr_dev, u64 in_param,
> +unsigned long in_modifier, u8 op_modifier,
> +u16 op, unsigned long timeout)
> +{
> + return __hns_roce_cmd(hr_dev, in_param, NULL, in_modifier,
> +   op_modifier, op, timeout);
> +}
> +
> +/* Invoke a command with an output mailbox */
> +static inline int hns_roce_cmd_box(struct hns_roce_dev *hr_dev, u64 in_param,
> +u64 out_param, unsigned long in_modifier,
> +u8 op_modifier, u16 op,
> +unsigned long timeout)
> +{
> + return __hns_roce_cmd(hr_dev, in_param, _param, in_modifier,
> +   op_modifier, op, timeout);
> +}

This will make people scratch their head in the future.  You are using
two commands to map to one command without there being any locking
involved.  The typical convention for routine_1() -> __routine_1() is
that the __ version requires that it be called while locked, and the
version without a __ does the locking before calling it.  That way a
used can always know if they aren't currently holding the appropriate
lock, then they6 call routine_1() and if they are, they call
__routine_1() to avoid a deadlock.  I would suggest changing the name of
__hns_roce_cmd to hns_roce_cmd_box and completely remove the existing
hns_roce_cmd_box inline, and then change the hns_roce_cmd() inline to
directly call hns_roce_cmd_box() which will then select between
event/poll command sends.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD

signature.asc
Description: OpenPGP digital signature

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Sergei Shtylyov


Hello.

On 05/13/2016 10:06 AM, Uwe Kleine-König wrote:

[...]

Index: net-next/drivers/of/of_mdio.c
===
--- net-next.orig/drivers/of/of_mdio.c
+++ net-next/drivers/of/of_mdio.c
@@ -44,6 +44,7 @@ static int of_get_phy_id(struct device_n
static void of_mdiobus_register_phy(struct mii_bus *mdio,
struct device_node *child, u32 addr)
{
+   struct gpio_desc *gpiod;
struct phy_device *phy;
bool is_c45;
int rc;
@@ -52,10 +53,17 @@ static void of_mdiobus_register_phy(stru
is_c45 = of_device_is_compatible(child,
 "ethernet-phy-ieee802.3-c45");

+   gpiod = fwnode_get_named_gpiod(>fwnode, "reset-gpios");
+   /* Deassert the reset signal */
+   if (!IS_ERR(gpiod))
+   gpiod_direction_output(gpiod, 0);


This is wrong I think. You must only ignore -ENODEV, all other error


   At least -ENOSYS should also be ignored (it's returned when gpiolib is
not configured), right?


No, that's a common misconception. If GPIOLIB is off you cannot
determine if dt specified a reset gpio. So you have the choice between:

 a) ignore -ENOSYS and so don't handle the reset line in the cases where
it's necessary probably yielding an "Error: phy not found" message.
 b) fail to probe even if a reset handling isn't necessary, yielding
"Error: failed to get hold of reset gpio".

I say b) is the better one. It's easier to debug because the error
message is better, GPIOLIB is enabled in most cases anyhow (still maybe
select it?) and it's ensured that we're operating in the limits of the
hardware specs (maybe a phy returns a random value when a register is
read while reset is applied?).


   It returns all ones in my case.


When does -ENODEV gets returned (it's not easy to follow)?


I don't know for sure for fwnode_get_named_gpiod, but the gpiod_get*()
family returns -ENODEV if the node doesn't have a reset-gpio property.


   Are you sure it's not -ENOENT?


codes should be passed to the caller.


   The caller doesn't care anyway...


(I see that's not trivial because
of_mdiobus_register_phy returns void.)


   I've made this function *void* in net-next.


I'd say this is a step in the wrong direction. For example this makes it
impossible to handle the case where the phy should be probed, the gpio
driver isn't available yet, though. Normally you'd want to return
-EPROBE_DEFER in this case and retry probing later.


   Well, of_mdiobus_register() is not an easy function to add the checks 
whiuch were never there (and undo the done stuff on failure). I'll see what I 
can do but no promises...



In my patch I used devm_gpiod_get_array which has the nice property that
I can already pass GPIOD_OUT_LOW in flags. Also this binds the lifetime
of the gpio to the device which is nice and IMHO the right direction for
the phylib (i.e. better embracing of the device model).

This cannot be used here easily however because there is no struct
device yet and this is only created after the phy id is determined.


   Your last patch [1] didn't make use of the managed device API (devm)
either, I didn't quite get to the bottom of that...


Right, devm didn't work. My patch was a prototype for a way that allowed
to bind the lifetime of the gpio to the device. This might be longer
than the call to mdiobus_unregister.


   Ah, that was the reason... Well, then you hardly achieved anything with 
rehashing the code...



See the problems that i2c handles
because it doesn't handle lifetimes correctly in drivers/i2c/i2c-core.c
at the end of i2c_del_adapter.


The phy id is either read from the device tree or must be read from
the phy which might fail if reset is not deasserted.



Principally there is no reason however that the phy_id must be known
before the struct device is created however.


   It's just that the code is cleaner that way...


I don't agree, I don't see that

determine_phyid()
allocate_device()

is better than

allocate_device()
determine_phyid()


   This is an oversimplified view. Actually, it is:

error = determine_phyid()
if (error)
return
allocate_device()

versus

allocate_device()
error = determine_phyid()
if (error)
free_device()


. The former is maybe easier because that's the status quo and it
doesn't need patching. But IMHO the result is on a similar level of
cleanliness.


   I disagree. And I don't see why it's necessary at all. Just to use another 
gpiolib API?



Best regards
Uwe


MBR, Sergei

Re: [patch iproute2 00/11] devlink: add support for shared buffer configuration and control

2016-05-13 Thread Stephen Hemminger

On Fri, 15 Apr 2016 09:51:42 +0200
Jiri Pirko  wrote:

> From: Jiri Pirko 
> 
> Jiri Pirko (11):
>   devlink: fix "devlink port" help message
>   list: add list_for_each_entry_reverse macro
>   list: add list_add_tail helper
>   devlink: introduce pr_out_port_handle helper
>   devlink: introduce helper to print out nice names (ifnames)
>   devlink: split dl_argv_parse_put to parse and put parts
>   devlink: introduce dump filtering function
>   devlink: allow to parse both devlink and port handle in the same time
>   devlink: implement shared buffer support
>   devlink: implement shared buffer occupancy control
>   devlink: add manpage for shared buffer
> 
>  devlink/devlink.c  | 1310 
> +---
>  include/linux/devlink.h|   63 +++
>  include/list.h |   16 +
>  man/man8/devlink-dev.8 |2 +
>  man/man8/devlink-monitor.8 |1 +
>  man/man8/devlink-port.8|2 +
>  man/man8/devlink-sb.8  |  313 +++
>  man/man8/devlink.8 |5 +
>  8 files changed, 1636 insertions(+), 76 deletions(-)
>  create mode 100644 man/man8/devlink-sb.8
> 

I had to revert all this. Because you are depending on headers in net-next.
Please resubmit against net-next tree of iproute.

Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting

2016-05-13 Thread Stephen Hemminger

On Fri, 6 May 2016 15:28:25 +0100
Edward Cree  wrote:

> Since we can only configure unicast, we probably want to be able to
> display unicast, rather than multicast.
> 
> Fixes: 906ac5437ab8 ("geneve: add support for IPv6 link partners")
> Signed-off-by: Edward Cree 
> ---
> I'm assuming this is what was intended, but tbh I don't know why we
> need to check for multicast on the display side at all, rather than
> just displaying whatever the kernel gives us.
> 
>  ip/iplink_geneve.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Applied thanks

Re: [RESEND PATCH v7 07/21] IB/hns: Add event queue support

2016-05-13 Thread Doug Ledford

On 05/09/2016 11:04 PM, Lijun Ou wrote:

> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h

> @@ -29,10 +31,93 @@
>  #define HNS_ROCE_AEQE_VEC_NUM1
>  #define HNS_ROCE_AEQE_OF_VEC_NUM 1
>  
> +#define ADDR_SHIFT_1212
>  #define ADDR_SHIFT_3232
> +#define ADDR_SHIFT_4444

I saw some of the early review requests to replace hard coded values
with defines so that things made more sense.  And you've done a lot of
that quite well.  However, this one is not so helpful.  Sure, it's an
address shift, but the define gives no context as to why these shifts
are what they are.  Someone from the x86 world might mistakenly think
the 12 shift is all about page size shifting, but on arm you have
multiple different page sizes and 12 may not work at all.  So, some
context in the name for these items, or else a comment above these
defines letting us know what's special about these three shift values
would be helpful.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD

signature.asc
Description: OpenPGP digital signature

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Florian Westphal

Eric W. Biederman  wrote:
> Florian could you test and verify this patch fixes your issues?

Yes, this seems to work.

Pablo, I'm fine with this patch going into -nf/stable but I do not think
making the pointers per netns is a desireable option in the long term.

> Unlike the other possibilities that have been discussed this also
> addresses the nf_queue path as well as the nf_queue_hook_drop path.

The nf_queue path should have been fine, no?

Or putting it differently: can we start processing skbs before a netns
is fully initialized?

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Sergei Shtylyov


Hello.

On 05/13/2016 07:06 AM, Andrew Lunn wrote:


+   gpiod = fwnode_get_named_gpiod(>fwnode, "reset-gpios");
+   /* Deassert the reset signal */
+   if (!IS_ERR(gpiod))
+   gpiod_direction_output(gpiod, 0);


This is wrong I think. You must only ignore -ENODEV, all other error


   At least -ENOSYS should also be ignored (it's returned when
gpiolib is not configured), right? When does -ENODEV gets returned
(it's not easy to follow)?


codes should be passed to the caller.


   The caller doesn't care anyway...


It should do.


   Tell that to Florian. So far, everybody has been happy with 
of_mdiobus_register(). Until I had to touch this code. :-)



What if fwnode_get_named_gpiod() returns -EPROBE_DEFER
because the GPIO driver has not been loaded yet?


   Bad luck. :-)
   Seriously, I'll see what I can do but it's not a trivial case.


Andrew


MBR, Sergei

Re: [RESEND PATCH v7 00/21] Add HiSilicon RoCE driver

2016-05-13 Thread Doug Ledford

On 05/09/2016 11:04 PM, Lijun Ou wrote:
> The HiSilicon Network Substem is a long term evolution IP which is
> supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network
> Sybsystem) also has a hardware support of performing RDMA with
> RoCEE.
> The driver for HiSilicon RoCEE(RoCE Engine) is a platform driver and
> will support mulitple versions of SOCs in future. This version of driver
> is meant to support Hip06 SoC(which confirms to RoCEEv1 hardware
> specifications).
> 
> Changes v6 -> v7:
> 1. modify some type of parameter, use bool replace the original type.
> 2. add the Signed-off-by signatures in the first patch.
> 3. delete the improper print sentence in hns_roce_create_eq.

One thing you should be aware of (but which you may not care about,
which is fine) is that if you want your driver to be available as part
of the OpenFabric Alliances OFED distribution, the license on the files
would need to be "GPL or BSD" (see the remainder of the RDMA stack for
examples).  If you don't want that license on your files, I'm fine with
that, and I won't hold up your submission based on the license.  If,
however, you *do* want your code to be shipable as part of OFED, it is
much easier to update the license on the files before you submit them
and third parties modify them.  If you want to change the license later,
you have to get every person that has made any substantive change to the
file to agree to the license change.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD

signature.asc
Description: OpenPGP digital signature

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Florian Westphal

Eric W. Biederman  wrote:
> Florian Westphal  writes:
> 
> > Eric W. Biederman  wrote:
> >> > AFAICS no other callers do something similar, but yes,
> >> > we'd need this all over the place if there are others.
> >> >
> >> > Maybe we need a saner fix, e.g. by adding bounds check to net_generic(),
> >> > and making sure that net_generic() returns non-NULL only if the per
> >> > netns memory was allocated properly.
> >> 
> >> As a first approximiation I am thinking we should fix by making
> >> nf_queue_register_handler a per netns operation.
> >
> > We can do that but then we'd end up storing the very same address
> > for each namespace which I think isn't nice either.
> 
> Sort of.  At the same time we fundamentally need a per namespace
> variable to ask if things were initialized.  So using the address as
> that variable changes the equation not at all, and keeps the code
> simpler.

We already do this for nf_conntrack_event_cb and nf_expect_event_cb so
we store 16 byte per netns where one bit would suffice.

> >> Either that or give nfnetlink_queue it's own dedicated place in
> >> struct net for struct nfnl_queue_net.
> >
> > That would work too, but then I don't see why that is preferrable
> > to this proposed patch. We could add a helper for this so that
> > we have something like

Addendum: its also a non-starter as nfqueue data is quite big
and the module won't be loaded in most configurations.

Looking at 70e9942f17a6193e9172a804e6569a8806633d6b again it seems
the problem is somewhat related to this one, although not identical.

Perhaps we should consider adding

nf_netns_feature_ok(struct net *, enum nfnet_feature);

enum nfnet_feature {
NFNET_QUEUE,
NFNET_EVENT,
};

and then set/unset that from the netns init/exit handlers.

nf_netns_feature_ok would then just need one bit in struct nf_net
to store wheter whatever feature we care about is available.

We could then de-netnsify those pointers again.

> > static bool netns_was_inited(const struct net *n)
> > {
> > return net->list.next != NULL;
> > }
> 
> That does not work because the real question were the things this piece
> of code cares about initialized.

Hmm, AFAIU we can't have packets queued without the netns being on
the list, no?

> Very much no.
> 
> I believe that changing net_generic to return a value so that
> nfqnl_nf_hook_drop can detect if the timing is wrong is an error.

Well, I don't see how this is fixable without having some means of
detecting wheter 'timing' is wrong...

You could say that making calls that end up in nfnetlink_queue
without a successful ->init() is illegal but in that case you
will need some condition by which the netfilter core can know
wheter that call is ok in the first place.

So I do not think that detecting it within nfnetlink_queue is worse
than detecting this in the caller, in fact, detecting it in callee
is better since the caller can't forget the test...

> I also don't think your changes to net_generic will work as the number
> of pointers should have been allocted properly from the start.

As I said I did not test it.
I only saw that net_assign_generic() can expand this array so I
figured one has to check if ptr->[id] is useable.

> Your proposed change is just papering over the problem and fixing it at
> the wrong level.  (The obvious level yes) but still the wrong level.

Well, I have no more ideas.

Looking at the number of function pointers used within netfilter
I'm worried that we will end up pushing more and more redundant info
into struct net to solve this "problem".

Re: linux-next: build failure after merge of the net-next tree

2016-05-13 Thread Arnd Bergmann

On Tuesday 10 May 2016 11:39:34 Bjorn Andersson wrote:
> On Mon 09 May 18:29 PDT 2016, Stephen Rothwell wrote:
> 
> > Hi all,
> > 
> > After merging the net-next tree, today's linux-next build (x86_64
> > allmodconfig) failed like this:
> > 
> > net/qrtr/smd.c:106:14: error: initialization from incompatible pointer type 
> > [-Werror=incompatible-pointer-types]
> >   .callback = qcom_smd_qrtr_callback,
> >   ^
> > net/qrtr/smd.c:106:14: note: (near initialization for 
> > 'qcom_smd_qrtr_driver.callback')
> > 
> > Caused by commit
> > 
> >   bdabad3e363d ("net: Add Qualcomm IPC router")
> > 
> > interacting with commit
> > 
> >   b853cb9628bf ("soc: qcom: smd: Make callback pass channel reference")
> > 
> > from the arm-soc tree.
> > 
> > I added the following merge fix patch (and it turned out I needed the
> > new stubs).
> > 
> 
> Sorry for not spotting this issue earlier, I missed Andy's second pull
> request towards arm-soc and thought the SMD changes missed this cycle.
> 
> 
> Your patch looks good, but I'm not sure how we should approach the merge
> window; Andy can't pick the patch because he doesn't have the qrtr code
> and David doesn't have the SMD patches coming through Andy.
> 
> FWIW, Reviewed-by: Bjorn Andersson 
> 
> 
> I assume we could have the QRTR go through Andy and arm-soc, with
> David's approval and this fix squashed in. But we're running rather late
> in this cycle, perhaps we should just back the QRTR patches out and I
> can respin and resend them after the merge window (for v4.8 instead)?

I'd suggest you do a merge of next-next with the qcom/soc-2 branch that
we have in arm-soc and resolve the conflict in the merge, then send
a pull request with the merge to davem.

Alternatively, in case Linus merges net-next before we get that fix
in, I could send Stephen's fix to Linus along with the pull requests.

Arnd

Re: [PATCH] Bluetooth: Fix l2cap_sock_teardown_cb race condition with bt_accept_dequeue

2016-05-13 Thread Yichen Zhao

Hi Marcel,

> so I am not big fan of the conditional locking in case of parent is set or 
> not. Do you have a test case that reproduces the mentioned race. It would 
> love to have that in tools/l2cap-tester or similar.

So far I could only reproduce the bug by repeatedly performing RFCOMM 
connections and resets. I'll try to implement something in rfcomm-tester or 
l2cap-tester.

Since this is a race condition, I'm not confident that I can help you reproduce 
the bug reliably on a different test setup. I'd appreciate it very much if you 
can offer any tips on triggering a race condition faster in a test case.

> Maybe the code needs some restructuring to avoid the conditional locking.

I agree that my patch is not very elegant, and I'd love any way to improve it.
I have some ideas, but I'm not familiar enough with kernel development to know 
whether other solutions are safe to implement, such as:

* Removing bt_accept_unlink from l2cap_teardown_cb, and relying on 
bt_accept_dequeue to unlink the socket when it's enumerated. Is it safe to 
leave a zapped sock in accept_q?
* Perform "unlock sock; lock parent; lock sock" before calling bt_accept_unlink 
in teardown_cb. This is still conditional locking, but around a smaller block 
of code. Is it safe to unlock a zapped sock?
* Use RCU for handling accept_q. Is this appropriate?

Please let me know what you think.

Regards,

Yichen Zhao

[PATCH 3/3] netfilter: ipset: use setup_timer() and mod_timer().

2016-05-13 Thread Muhammad Falak R Wani

Use setup_timer() and instead of init_timer(), being the preferred way
of setting up a timer.

Also, quoting the mod_timer() function comment:
-> mod_timer() is a more efficient way to update the expire field of an
   active timer (if the timer is inactive it will be activated).

Use setup_timer() and mod_timer() to setup and arm a timer, making the
code compact and easier to read.

Signed-off-by: Muhammad Falak R Wani 
---
 net/netfilter/ipset/ip_set_list_set.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_list_set.c 
b/net/netfilter/ipset/ip_set_list_set.c
index a2a89e4..530da56 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -570,11 +570,8 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned 
long ul_set))
 {
struct list_set *map = set->data;
 
-   init_timer(>gc);
-   map->gc.data = (unsigned long)set;
-   map->gc.function = gc;
-   map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
-   add_timer(>gc);
+   setup_timer(>gc, gc, (unsigned long)set);
+   mod_timer(>gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
 }
 
 /* Create list:set type of sets */
-- 
1.9.1

[PATCH 1/3] netfilter: ipset: use setup_timer() and mod_timer().

2016-05-13 Thread Muhammad Falak R Wani

Use setup_timer() and instead of init_timer(), being the preferred way
of setting up a timer.

Also, quoting the mod_timer() function comment:
-> mod_timer() is a more efficient way to update the expire field of an
   active timer (if the timer is inactive it will be activated).

Use setup_timer() and mod_timer() to setup and arm a timer, making the
code compact and easier to read.

Signed-off-by: Muhammad Falak R Wani 
---
 net/netfilter/ipset/ip_set_bitmap_gen.h | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h 
b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 2e8e7e5..29ae5d8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -40,11 +40,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long 
ul_set))
 {
struct mtype *map = set->data;
 
-   init_timer(>gc);
-   map->gc.data = (unsigned long)set;
-   map->gc.function = gc;
-   map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
-   add_timer(>gc);
+   setup_timer(>gc, gc, (unsigned long)set);
+   mod_timer(>gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
 }
 
 static void
-- 
1.9.1

[PATCH 2/3] netfilter: ipset: use setup_timer() and mod_timer().

2016-05-13 Thread Muhammad Falak R Wani

Use setup_timer() and instead of init_timer(), being the preferred way
of setting up a timer.

Also, quoting the mod_timer() function comment:
-> mod_timer() is a more efficient way to update the expire field of an
   active timer (if the timer is inactive it will be activated).

Use setup_timer() and mod_timer() to setup and arm a timer, making the
code compact and easier to read.

Signed-off-by: Muhammad Falak R Wani 
---
 net/netfilter/ipset/ip_set_hash_gen.h | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h 
b/net/netfilter/ipset/ip_set_hash_gen.h
index d32fd6b..5a06ff2 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -444,11 +444,8 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long 
ul_set))
 {
struct htype *h = set->data;
 
-   init_timer(>gc);
-   h->gc.data = (unsigned long)set;
-   h->gc.function = gc;
-   h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
-   add_timer(>gc);
+   setup_timer(>gc, gc, (unsigned long)set);
+   mod_timer(>gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
pr_debug("gc initialized, run in every %u\n",
 IPSET_GC_PERIOD(set->timeout));
 }
-- 
1.9.1

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Sergei Shtylyov


On 05/13/2016 11:44 PM, Andrew Lunn wrote:


Another issue is that on some boards we have one reset line tied to
multiple PHYs.How do we prevent multiple resets being taking place when each of
the PHYs are registered?


   My patch just doesn't address this case -- it's about the
individual resets only.


This actually needs to be addresses a layer above. What you have is a
bus reset, not a device reset.


   No.
   There's simply no such thing as a bus reset for the xMII/MDIO busses, 
there's simply no reset signaling on them. Every device has its own reset 
signal and its own timing requirements.



So the gpio line is associated to the mdio bus, not a PHY.


   No.


Either your MDIO driver needs to handle the gpio
line,  or in __mdio_register(),


   __mdiobus_register(), you mean?


before it starts looking at the
children.


   It's basically the same thing.
   The MDIO bus reset is a misconception.



Andrew


MBR, Sergei

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Eric W. Biederman


Florian could you test and verify this patch fixes your issues?

Unlike the other possibilities that have been discussed this also
addresses the nf_queue path as well as the nf_queue_hook_drop path.

And frankly that is the best argument I have for fixing it this way
because potentially nf_queue could have similar issues.

Eric

From: "Eric W. Biederman" 
Date: Fri, 13 May 2016 15:26:03 -0500
Subject: [PATCH] nf_queue: Make the queue_handler pernet

Florian Weber reported:
> Under full load (unshare() in loop -> OOM conditions) we can
> get kernel panic:
>
> BUG: unable to handle kernel NULL pointer dereference at 0008
> IP: [] nfqnl_nf_hook_drop+0x35/0x70
> [..]
> task: 88012dfa3840 ti: 88012dffc000 task.ti: 88012dffc000
> RIP: 0010:[]  [] 
> nfqnl_nf_hook_drop+0x35/0x70
> RSP: :88012dfffd80  EFLAGS: 00010206
> RAX: 0008 RBX: 81add0c0 RCX: 88013fd8
> [..]
> Call Trace:
>  [] nf_queue_nf_hook_drop+0x18/0x20
>  [] nf_unregister_net_hook+0xdb/0x150
>  [] netfilter_net_exit+0x2f/0x60
>  [] ops_exit_list.isra.4+0x38/0x60
>  [] setup_net+0xc2/0x120
>  [] copy_net_ns+0x79/0x120
>  [] create_new_namespaces+0x11b/0x1e0
>  [] unshare_nsproxy_namespaces+0x57/0xa0
>  [] SyS_unshare+0x1b2/0x340
>  [] entry_SYSCALL_64_fastpath+0x1e/0xa8
> Code: 65 00 48 89 e5 41 56 41 55 41 54 53 83 e8 01 48 8b 97 70 12 00 00 48 98 
> 49 89 f4 4c 8b 74 c2 18 4d 8d 6e 08 49 81 c6 88 00 00 00 <49> 8b 5d 00 48 85 
> db 74 1a 48 89 df 4c 89 e2 48 c7 c6 90 68 47
>

The simple fix for this requires a new pernet variable for struct
nf_queue that indicates when it is safe to use the dynamically
allocated nf_queue state.

As we need a variable anyway make nf_register_queue_handler and
nf_unregister_queue_handler pernet.  This allows the existing logic of
when it is safe to use the state from the nfnetlink_queue module to be
reused with no changes except for making it per net.

The syncrhonize_rcu from nf_unregister_queue_handler is moved to a new
function nfnl_queue_net_exit_batch so that the worst case of having a
syncrhonize_rcu in the pernet exit path is not experienced in batch
mode.

Reported-by: Florian Westphal 
Signed-off-by: "Eric W. Biederman" 
---
 include/net/netfilter/nf_queue.h |  4 ++--
 include/net/netns/netfilter.h|  2 ++
 net/netfilter/nf_queue.c | 17 -
 net/netfilter/nfnetlink_queue.c  | 18 --
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 9c5638ad872e..0dbce55437f2 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -28,8 +28,8 @@ struct nf_queue_handler {
struct nf_hook_ops *ops);
 };
 
-void nf_register_queue_handler(const struct nf_queue_handler *qh);
-void nf_unregister_queue_handler(void);
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler 
*qh);
+void nf_unregister_queue_handler(struct net *net);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
 void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 38aa4983e2a9..36d723579af2 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -5,11 +5,13 @@
 
 struct proc_dir_entry;
 struct nf_logger;
+struct nf_queue_handler;
 
 struct netns_nf {
 #if defined CONFIG_PROC_FS
struct proc_dir_entry *proc_netfilter;
 #endif
+   const struct nf_queue_handler __rcu *queue_handler;
const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO];
 #ifdef CONFIG_SYSCTL
struct ctl_table_header *nf_log_dir_header;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e6ac..b19ad20a705c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
  * Once the queue is registered it must reinject all packets it
  * receives, no matter what.
  */
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
 
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler 
*qh)
 {
/* should never happen, we only have one queueing backend in kernel */
-   WARN_ON(rcu_access_pointer(queue_handler));
-   rcu_assign_pointer(queue_handler, qh);
+   WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+   rcu_assign_pointer(net->nf.queue_handler, qh);
 }
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
 {
-

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Andrew Lunn

> >Another issue is that on some boards we have one reset line tied to
> >multiple PHYs.How do we prevent multiple resets being taking place when each 
> >of
> >the PHYs are registered?
> 
>My patch just doesn't address this case -- it's about the
> individual resets only.

This actually needs to be addresses a layer above. What you have is a
bus reset, not a device reset. So the gpio line is associated to the
mdio bus, not a PHY. Either your MDIO driver needs to handle the gpio
line, or in __mdio_register(), before it starts looking at the
children.

Andrew

Re: [PATCH] i40e: constify i40e_client_ops structure

2016-05-13 Thread Doug Ledford

On 05/13/2016 03:40 PM, Jeff Kirsher wrote:
> On Fri, 2016-05-13 at 12:50 -0400, Doug Ledford wrote:
>> On 05/01/2016 08:07 AM, Julia Lawall wrote:
>>> The i40e_client_ops structure is never modified, so declare it as
>> const.
>>>  
>>> Done with the help of Coccinelle.
>>>  
>>> Signed-off-by: Julia Lawall 
>>
>> Thanks, applied.
> 
> I already have this patch queued up to push through David Miller's net-next 
> tree.
> 
> Julia- please do not shotgun blast several maintainers and several lists,
> it causes confusion on who is *REALLY* going to accept and apply the patch.
> Please use scripts/get_maintainer.pl to get the correct mailing lists and
> maintainers to add to the To: and CC: lines.
> 

I've dropped it from my tree.  There won't be a conflict.

-- 
Doug Ledford 
  GPG KeyID: 0E572FDD




signature.asc
Description: OpenPGP digital signature

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Eric W. Biederman

Florian Westphal  writes:

> Eric W. Biederman  wrote:
>> > AFAICS no other callers do something similar, but yes,
>> > we'd need this all over the place if there are others.
>> >
>> > Maybe we need a saner fix, e.g. by adding bounds check to net_generic(),
>> > and making sure that net_generic() returns non-NULL only if the per
>> > netns memory was allocated properly.
>> 
>> As a first approximiation I am thinking we should fix by making
>> nf_queue_register_handler a per netns operation.
>
> We can do that but then we'd end up storing the very same address
> for each namespace which I think isn't nice either.

Sort of.  At the same time we fundamentally need a per namespace
variable to ask if things were initialized.  So using the address as
that variable changes the equation not at all, and keeps the code
simpler.

>> Either that or give nfnetlink_queue it's own dedicated place in
>> struct net for struct nfnl_queue_net.
>
> That would work too, but then I don't see why that is preferrable
> to this proposed patch. We could add a helper for this so that
> we have something like
>
> static bool netns_was_inited(const struct net *n)
> {
>   return net->list.next != NULL;
> }

That does not work because the real question were the things this piece
of code cares about initialized.

> Another alternative is this patch (not even compile tested, just to
> illustrate the idea):

Very much no.

I believe that changing net_generic to return a value so that
nfqnl_nf_hook_drop can detect if the timing is wrong is an error.

I also don't think your changes to net_generic will work as the number
of pointers should have been allocted properly from the start.  

Your proposed change is just papering over the problem and fixing it at
the wrong level.  (The obvious level yes) but still the wrong level.

> What do you think?

Eric

abonnieren netdev

2016-05-13 Thread Marian Schu

majord...@vger.kernel.org

netdev

2016-05-13 Thread Marian Schu

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Florian Westphal

Eric W. Biederman  wrote:
> > AFAICS no other callers do something similar, but yes,
> > we'd need this all over the place if there are others.
> >
> > Maybe we need a saner fix, e.g. by adding bounds check to net_generic(),
> > and making sure that net_generic() returns non-NULL only if the per
> > netns memory was allocated properly.
> 
> As a first approximiation I am thinking we should fix by making
> nf_queue_register_handler a per netns operation.

We can do that but then we'd end up storing the very same address
for each namespace which I think isn't nice either.

> Either that or give nfnetlink_queue it's own dedicated place in
> struct net for struct nfnl_queue_net.

That would work too, but then I don't see why that is preferrable
to this proposed patch. We could add a helper for this so that
we have something like

static bool netns_was_inited(const struct net *n)
{
return net->list.next != NULL;
}

Another alternative is this patch (not even compile tested, just to
illustrate the idea):

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -38,7 +38,11 @@ static inline void *net_generic(const struct net *net, int 
id)
 
rcu_read_lock();
ng = rcu_dereference(net->gen);
-   ptr = ng->ptr[id - 1];
+
+   if (ng->len < id)
+   ptr = ng->ptr[id - 1];
+   else
+   ptr = NULL;
rcu_read_unlock();
 
return ptr;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -111,6 +111,7 @@ static int ops_init(const struct pernet_operations *ops, 
struct net *net)
return 0;
 
 cleanup:
+   net_assign_generic(net, *ops->id, NULL);
kfree(data);
 
 out:
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -927,6 +927,9 @@ static void nfqnl_nf_hook_drop(struct net *net, struct 
nf_hook_ops *hook)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
 
+   if (!q)
+   return;
+
rcu_read_lock();
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;

What do you think?

Re: [PATCH] Add support for configuring Infiniband GUIDs

2016-05-13 Thread Eli Cohen

Hi Stephen,

can you please comment on this or are you going to apply this?

On Mon, May 09, 2016 at 03:52:02PM +, Eli Cohen wrote:
> Hi Stephen,
> 
> ibstat is usually used with MLNX_OFED distributions and I don't think it is a 
> part of libibverbs.
> Moreover, in a previous patch I sent 
> https://patchwork.ozlabs.org/patch/596492/ I documented in the change log the 
> same format I mention in this patch. So if I change the format here we will 
> have the changelog in the kernel tree invalid.
> 
> How do you think this should be addressed?
>

Re: [PATCH nf V2] netfilter: fix oops in nfqueue during netns error unwinding

2016-05-13 Thread Eric W. Biederman

Florian Westphal  writes:

> Eric W. Biederman  wrote:
>> > On Wed, May 11, 2016 at 05:41:13PM +0200, Florian Westphal wrote:
>> >> diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
>> >> index 5baa8e2..9722819 100644
>> >> --- a/net/netfilter/nf_queue.c
>> >> +++ b/net/netfilter/nf_queue.c
>> >> @@ -102,6 +102,13 @@ void nf_queue_nf_hook_drop(struct net *net, struct 
>> >> nf_hook_ops *ops)
>> >>  {
>> >>   const struct nf_queue_handler *qh;
>> >>  
>> >> + /* netns wasn't initialized, error unwind in progress.
>> >> +  * Its possible that the nfq netns init function was not even
>> >> +  * called, in which case nfq pernetns data is in undefined state.
>> >> +  */
>> >> + if (!net->list.next)
>> >> + return;
>> >
>> > Thanks Florian.
>> >
>> > Question for the netns folks: Is this a stable assumption? My only
>> > concern with this is that it relies on internal the netns
>> > representation, so I'd like to make sure this thing doesn't break
>> > again.
>> >
>> > Let me know, thanks.
>> 
>> Ugh. I got distracted before I finished replying.
>> 
>> I don't think the analysis is correct.
>
> I am afraid it is, see below for example
>
>> The unwind and the netns exit path should maintain the same constraints.
>> If they don't there is a bug, perhaps in initialization ordering.
>
> Since 8405a8fff3f8545c888a872d6e3c0c8eecd4d348 any call to
> nf_unregister_hook invokes nf_queue_nf_hook_drop().
>
> If the nfnetlink_queue module is loaded, that means we call
> nfqnl_nf_hook_drop function via the nf_queue nf_queue_handler struct
> in nf_queue.c .
>
> And since nfqnl_nf_hook_drop() uses net_generic() ...
>
>> Code should be able to count on net_generic() from the beginning of the
>> corresponding .init to the end of the corresponding .fini
>
> ... we cannot count on this.

I guess what I meant was that we should see if we can fix this.

> Example:
> 1. we try to create new netns
> 2. net_namespace.c:ops_init walks the netns op list and calls ->init()
> 3. some of these init hooks register netfilter hooks
> 4. we call init hook for nfnetlink_queue, but this yields -EFOO 
> (alternatively, another ->init handler before this failed)
> 5. netns init code invokes the ->exit() handlers in reverse order for 
> unwinding
> 6. hooks get unregistered, which makes a call into nf_queue_nf_hook_drop
> 7. nf_queue then calls q->nfqnl_nf_hook_drop(net)
> 8. oops as nfnetlink_queue wasn't setup in this net ns and net_generic
> returns some undefined result
>
>> Something like that is not happening and if we can I would like to track
>> down why.
>> 
>> Otherwise we need code like the above all over the place.
>
> AFAICS no other callers do something similar, but yes,
> we'd need this all over the place if there are others.
>
> Maybe we need a saner fix, e.g. by adding bounds check to net_generic(),
> and making sure that net_generic() returns non-NULL only if the per
> netns memory was allocated properly.

As a first approximiation I am thinking we should fix by making
nf_queue_register_handler a per netns operation.

Either that or give nfnetlink_queue it's own dedicated place in
struct net for struct nfnl_queue_net.

Let's sort out the interface so we can't get this wrong.

Eric

Re: [PATCH] i40e: constify i40e_client_ops structure

2016-05-13 Thread Julia Lawall

On Fri, 13 May 2016, Jeff Kirsher wrote:

> On Fri, 2016-05-13 at 12:50 -0400, Doug Ledford wrote:
> > On 05/01/2016 08:07 AM, Julia Lawall wrote:
> > > The i40e_client_ops structure is never modified, so declare it as
> > const.
> > > 
> > > Done with the help of Coccinelle.
> > > 
> > > Signed-off-by: Julia Lawall 
> > 
> > Thanks, applied.
> 
> I already have this patch queued up to push through David Miller's net-next 
> tree.
> 
> Julia- please do not shotgun blast several maintainers and several lists,
> it causes confusion on who is *REALLY* going to accept and apply the patch.
> Please use scripts/get_maintainer.pl to get the correct mailing lists and
> maintainers to add to the To: and CC: lines.

I do.  Probably the patch affects files maintained by different people.  
My assumption would be that everyone responsible for the code that is 
affected by the patch would want to see it.  Perhaps there would have been 
a way to separate it, but the const annotations typically have to be 
consistent everywhere.

julia

Re: [PATCH] i40e: constify i40e_client_ops structure

2016-05-13 Thread Jeff Kirsher

On Fri, 2016-05-13 at 12:50 -0400, Doug Ledford wrote:
> On 05/01/2016 08:07 AM, Julia Lawall wrote:
> > The i40e_client_ops structure is never modified, so declare it as
> const.
> > 
> > Done with the help of Coccinelle.
> > 
> > Signed-off-by: Julia Lawall 
> 
> Thanks, applied.

I already have this patch queued up to push through David Miller's net-next 
tree.

Julia- please do not shotgun blast several maintainers and several lists,
it causes confusion on who is *REALLY* going to accept and apply the patch.
Please use scripts/get_maintainer.pl to get the correct mailing lists and
maintainers to add to the To: and CC: lines.

signature.asc
Description: This is a digitally signed message part

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Sergei Shtylyov


Hello.

On 05/13/2016 12:07 PM, Roger Quadros wrote:

[...]


+}
+EXPORT_SYMBOL(mdio_device_reset);
+
 /**
  * mdio_probe - probe an MDIO device
  * @dev: device to probe
@@ -117,9 +126,16 @@ static int mdio_probe(struct device *dev
struct mdio_driver *mdiodrv = to_mdio_driver(drv);
int err = 0;

-   if (mdiodrv->probe)
+   if (mdiodrv->probe) {
+   /* Deassert the reset signal */
+   mdio_device_reset(mdiodev, 0);
+
err = mdiodrv->probe(mdiodev);

+   /* Assert the reset signal */
+   mdio_device_reset(mdiodev, 1);


I wonder if it's safe to do this in general. What if ->probe does
something with the phy that is lost by resetting but that is relied on
later?


mdio_probe is called for non PHY devices only, right?


   Yes, those also can have a reset signal.


I'm a bit lost as to why we're de-asserting reset at multiple places. i.e.
mdio_probe(), phy_device_register(), phy_init_hw(), phy_probe(), 
of_mdiobus_register_phy().



Isn't it simpler to just de-assert it once at the topmost level?


   I wasn't after simplicity here. The intent was to save some power putting 
the device at reset when it's not needed. Florian Fainelly (the phylib 
maintainer) actually wanted me to go even further with that and assert reset 
in phy_suspend() but it was too much for me.



i.e. of_mdiobus_register_device() f and of_mdiobus_register_phy()?

Also, how about issuing a reset pulse instead of just de-asserting it.


   If it's already held at reset, my assumption is that it's enough time 
passed already.



This would tackle the case where PHY is in invalid state with reset already
de-asserted.


Good question. I haven't yet met such cases though.


Another issue is that on some boards we have one reset line tied to
multiple PHYs.How do we prevent multiple resets being taking place when each of
the PHYs are registered?


   My patch just doesn't address this case -- it's about the individual 
resets only.



Do we just specify the reset line only for one PHY in
the DT or can we have the reset gpio in the mdio_bus node for such case?


   I think there's mii_bus::reset() method for that case. Some Ethernet 
drivers even use it

(mostly instead of the code being suggested here).


cheers,
-roger


MBR, Sergei

[PATCH net-next] net: vrf: protect changes to private data with rcu

2016-05-13 Thread David Ahern

One cpu can be processing packets which includes using the cached route
entries in the vrf device's private data and on another cpu the device
gets deleted which releases the routes and sets the pointers in net_vrf
to NULL. This results in datapath dereferencing a NULL pointer.

Fix by protecting access to dst's with rcu.

Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Fixes: 35402e313663 ("net: Add IPv6 support to VRF device")
Signed-off-by: David Ahern 
---
Dave: I can handle the backports to stable branches if you prefer.

 drivers/net/vrf.c | 70 +--
 1 file changed, 47 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 0ea29345eb2e..dff08842f26d 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -43,8 +43,8 @@
 #define DRV_VERSION"1.0"
 
 struct net_vrf {
-   struct rtable   *rth;
-   struct rt6_info *rt6;
+   struct rtable __rcu *rth;
+   struct rt6_info __rcu   *rt6;
u32 tb_id;
 };
 
@@ -273,10 +273,15 @@ static int vrf_output6(struct net *net, struct sock *sk, 
struct sk_buff *skb)
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
 
+/* holding rtnl */
 static void vrf_rt6_release(struct net_vrf *vrf)
 {
-   dst_release(>rt6->dst);
-   vrf->rt6 = NULL;
+   struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
+
+   rcu_assign_pointer(vrf->rt6, NULL);
+
+   if (rt6)
+   dst_release(>dst);
 }
 
 static int vrf_rt6_create(struct net_device *dev)
@@ -300,7 +305,8 @@ static int vrf_rt6_create(struct net_device *dev)
 
rt6->rt6i_table = rt6i_table;
rt6->dst.output = vrf_output6;
-   vrf->rt6 = rt6;
+   rcu_assign_pointer(vrf->rt6, rt6);
+
rc = 0;
 out:
return rc;
@@ -374,29 +380,35 @@ static int vrf_output(struct net *net, struct sock *sk, 
struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
+/* holding rtnl */
 static void vrf_rtable_release(struct net_vrf *vrf)
 {
-   struct dst_entry *dst = (struct dst_entry *)vrf->rth;
+   struct rtable *rth = rtnl_dereference(vrf->rth);
+
+   rcu_assign_pointer(vrf->rth, NULL);
 
-   dst_release(dst);
-   vrf->rth = NULL;
+   if (rth)
+   dst_release(>dst);
 }
 
-static struct rtable *vrf_rtable_create(struct net_device *dev)
+static int vrf_rtable_create(struct net_device *dev)
 {
struct net_vrf *vrf = netdev_priv(dev);
struct rtable *rth;
 
if (!fib_new_table(dev_net(dev), vrf->tb_id))
-   return NULL;
+   return -ENOMEM;
 
rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
-   if (rth) {
-   rth->dst.output = vrf_output;
-   rth->rt_table_id = vrf->tb_id;
-   }
+   if (!rth)
+   return -ENOMEM;
 
-   return rth;
+   rth->dst.output = vrf_output;
+   rth->rt_table_id = vrf->tb_id;
+
+   rcu_assign_pointer(vrf->rth, rth);
+
+   return 0;
 }
 
 / device handling /
@@ -484,8 +496,7 @@ static int vrf_dev_init(struct net_device *dev)
goto out_nomem;
 
/* create the default dst which points back to us */
-   vrf->rth = vrf_rtable_create(dev);
-   if (!vrf->rth)
+   if (vrf_rtable_create(dev) != 0)
goto out_stats;
 
if (vrf_rt6_create(dev) != 0)
@@ -528,8 +539,13 @@ static struct rtable *vrf_get_rtable(const struct 
net_device *dev,
if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) {
struct net_vrf *vrf = netdev_priv(dev);
 
-   rth = vrf->rth;
-   dst_hold(>dst);
+   rcu_read_lock();
+
+   rth = rcu_dereference(vrf->rth);
+   if (likely(rth))
+   dst_hold(>dst);
+
+   rcu_read_unlock();
}
 
return rth;
@@ -665,16 +681,24 @@ static struct sk_buff *vrf_l3_rcv(struct net_device 
*vrf_dev,
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 const struct flowi6 *fl6)
 {
-   struct rt6_info *rt = NULL;
+   struct dst_entry *dst = NULL;
 
if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
struct net_vrf *vrf = netdev_priv(dev);
+   struct rt6_info *rt;
+
+   rcu_read_lock();
+
+   rt = rcu_dereference(vrf->rt6);
+   if (likely(rt)) {
+   dst = >dst;
+   dst_hold(dst);
+   }
 
-   rt = vrf->rt6;
-   dst_hold(>dst);
+   rcu_read_unlock();
}
 
-   return (struct dst_entry *)rt;
+   return dst;
 }
 #endif
 
-- 
2.1.4

Re: [PATCH RFT 1/2] phylib: add device reset GPIO support

2016-05-13 Thread Sergei Shtylyov


Hello.

On 05/13/2016 12:35 AM, Sergei Shtylyov wrote:


[we already talked about this patch in #armlinux, I'm now just
forwarding my comments on the list. Background was that I sent an easier
and less complete patch with the same idea. See
http://patchwork.ozlabs.org/patch/621418/]

[added Linus Walleij to Cc, there is a question for you/him below]

On Fri, Apr 29, 2016 at 01:12:54AM +0300, Sergei Shtylyov wrote:

--- net-next.orig/Documentation/devicetree/bindings/net/phy.txt
+++ net-next/Documentation/devicetree/bindings/net/phy.txt
@@ -35,6 +35,8 @@ Optional Properties:
 - broken-turn-around: If set, indicates the PHY device does not correctly
   release the turn around line low at the end of a MDIO transaction.

+- reset-gpios: The GPIO phandle and specifier for the PHY reset signal.
+
 Example:

 ethernet-phy@0 {


This is great.


Index: net-next/drivers/net/phy/at803x.c
===
--- net-next.orig/drivers/net/phy/at803x.c
+++ net-next/drivers/net/phy/at803x.c
@@ -65,7 +65,6 @@ MODULE_LICENSE("GPL");
[...]


My patch breaks this driver. I wasn't aware of it.


   I tried to be as careful as I could but still it looks that I didn't
succeed at that too...


   Hm, I'm starting to forget the vital details about my patch...


[...]

Index: net-next/drivers/net/phy/mdio_device.c
===
--- net-next.orig/drivers/net/phy/mdio_device.c
+++ net-next/drivers/net/phy/mdio_device.c

[...]

@@ -117,9 +126,16 @@ static int mdio_probe(struct device *dev
 struct mdio_driver *mdiodrv = to_mdio_driver(drv);
 int err = 0;

-if (mdiodrv->probe)
+if (mdiodrv->probe) {
+/* Deassert the reset signal */
+mdio_device_reset(mdiodev, 0);
+
 err = mdiodrv->probe(mdiodev);

+/* Assert the reset signal */
+mdio_device_reset(mdiodev, 1);


I wonder if it's safe to do this in general. What if ->probe does
something with the phy that is lost by resetting but that is relied on
later?


   Well, I thought that config_init() method is designed for that but indeed
the LXT driver writes to BMCR in its probe() method and hence is broken.

> Thank you for noticing...

   It's broken even without my patch. The phylib will cause a PHY soft reset 
when attaching to the PHY device, so all BMCR programming dpone in the probe() 
method will be lost. My patch does make sense as is. Looks like I should also 
look into fixing lxt.c. Florian, what do you think?


[...]

MBR, Sergei

Re: [PATCH] phy: add support for a reset-gpio specification

2016-05-13 Thread Sergei Shtylyov


Hello.

On 05/13/2016 09:26 AM, Uwe Kleine-König wrote:


The framework only asserts (for now) that the reset gpio is not active.

Signed-off-by: Uwe Kleine-König 
---
Documentation/devicetree/bindings/net/phy.txt | 3 +++
drivers/net/phy/phy_device.c  | 8 
2 files changed, 11 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/phy.txt 
b/Documentation/devicetree/bindings/net/phy.txt
index bc1c3c8bf8fa..c00a9a894547 100644
--- a/Documentation/devicetree/bindings/net/phy.txt
+++ b/Documentation/devicetree/bindings/net/phy.txt
@@ -35,6 +35,8 @@ Optional Properties:
- broken-turn-around: If set, indicates the PHY device does not correctly
  release the turn around line low at the end of a MDIO transaction.

+- reset-gpios: Reference to a GPIO used to reset the phy.
+
Example:

ethernet-phy@0 {
@@ -42,4 +44,5 @@ ethernet-phy@0 {
interrupt-parent = <4>;
interrupts = <35 1>;
reg = <0>;
+   reset-gpios = < 17 GPIO_ACTIVE_LOW>;
 };


   OK, the example has PHY ID specified in the "compatible" prop. It's a
vital detail, you should mention it somewhere...


Something like that in the commit log:

Note that reset is only deasserted when the PHY ID is already
determined. If your phy doesn't support reading the PHY ID while
reset is asserted it's recommended to specify it in the
compatible string. Otherwise the device might fail to probe.


   Yes, that should do.


diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e551f3a89cfd..7d666ab47271 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c

[...]

@@ -1569,9 +1570,16 @@ static int phy_probe(struct device *dev)
struct device_driver *drv = phydev->mdio.dev.driver;
struct phy_driver *phydrv = to_phy_driver(drv);
int err = 0;
+   struct gpio_descs *reset_gpios;

phydev->drv = phydrv;

+   /* take phy out of reset */
+   reset_gpios = devm_gpiod_get_array_optional(dev, "reset",
+   GPIOD_OUT_LOW);
+   if (IS_ERR(reset_gpios))
+   return PTR_ERR(reset_gpios);
+


   It would have been logical to assert back the reset GPIO in phy_remove(),



Not necessarily. If you want to save some energy it might make sense. If


   Yes, we'd like to do that. Florian (the phylib maintainer) even wanted 
asserting reset during the PM suspended state.



you only focus on making the device work, ensuring that reset is not
applied is enough.



no? Also, you'd need to store the GPIO permanently somewhere...


I need to store it, iff I want to do something with the reset line
later. Currently that's not the case.


   Oops, I seem to somewhat muddled up here. Of course, you need to store it 
if you intend to de-assert it later.



Best regards
Uwe


MBR, Sergei

Re: [patch net-next 1/4] netdevice: add SW statistics ndo

2016-05-13 Thread Roopa Prabhu

On 5/12/16, 11:03 PM, Jiri Pirko wrote:
> Thu, May 12, 2016 at 11:10:08PM CEST, ro...@cumulusnetworks.com wrote:
>> On 5/12/16, 4:48 AM, Jiri Pirko wrote:
>>> From: Nogah Frankel 
>>>
>>> Till now we had a ndo statistics function that returned SW statistics.
>>> We want to change the "basic" statistics to return HW statistics if
>>> available.
>>> In this case we need to expose a new ndo to return the SW statistics.
>>> Add a new ndo declaration to get SW statistics
>>> Add a function that gets SW statistics if a competible ndo exist
>>>
>>> Signed-off-by: Nogah Frankel 
>>> Reviewed-by: Ido Schimmel 
>>> Signed-off-by: Jiri Pirko 
>>> ---
>>>
>> To me netdev stats is  combined 'SW + HW' stats for that netdev.
>> ndo_get_stats64 callback into the drivers does the magic of adding HW stats
>> to SW (netdev) stats and returning (see enic_get_stats). HW stats is 
>> available for netdevs
>> that are offloaded or are backed by hardware. SW stats is the stats that the 
>> driver maintains
>> (logical or physical). HW stats is queried and added to the SW stats.
> I'm not sure I follow. HW stats already contain SW stats. Because on
> slow path every packet that is not offloaded and goes through kernel is
> counted into HW stats as well (because it goes through HW port). 
yes, correct... we don't want to double count those. But since these stats are
generally queried from hw, I am calling them HW stats.
you will not really maintain a software counter for this. But, the driver can 
maintain its own
 counters for rx and tx errors etc and I call these SW stats. They are counted 
at the driver.

> If you
> do HW stats + SW stats, what you get makes no sense. Am I missing something?
If you go by my definition of HW and SW stats above, on a ndo_get_stats64() 
call,
you will add the SW counters + HW counters and return. In my definition, the 
pkts
that was rx'ed or tx'ed successfully are always in the HW count.

> Btw, looking at enic_get_stats, looks exactly what we introduce for
> mlxsw in this patchset.

In enic_get_stats, the ones counted in software are the ones taken from 'enic->'
 net_stats->rx_over_errors = enic->rq_truncated_pkts;
 net_stats->rx_crc_errors = enic->rq_bad_fcs;

>
> With this patchset, we only allow user to se the actual stats for
> slow-path aka SW stats.
hmm...ok. But i am not sure how many will use this new attribute.
When you do 'ip -s link show' you really want all counters on that port
hardware or software does not matter at that point.

My suggestion to move this to ethtool like attribute is because that is an 
existing
 way to break down your stats which ever way you want. And the best part is it 
can be
customized (say rx_pkts_cpu_saw)

Thanks,
Roopa

Re: [PATCH 3.2 085/115] veth: don???t modify ip_summed; doing so treats packets with bad checksums as good.

2016-05-13 Thread Ben Greear

On 05/13/2016 11:21 AM, David Miller wrote:

From: Ben Greear 
Date: Fri, 13 May 2016 09:57:19 -0700

How do you feel about a new socket-option to allow a socket to
request the old veth behaviour?

I depend upon the opinions of the experts who work upstream on and
maintain these components, since it is an area I am not so familiar
with.

Generally speaking asking me directly for opinions on matters like
this isn't the way to go, in fact I kind of find it irritating.  It
can't all be on me.

Fair enough, thanks for your time.

Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com

Re: [PATCH 3.2 085/115] veth: don???t modify ip_summed; doing so treats packets with bad checksums as good.

2016-05-13 Thread David Miller

From: Ben Greear 
Date: Fri, 13 May 2016 09:57:19 -0700

> How do you feel about a new socket-option to allow a socket to
> request the old veth behaviour?

I depend upon the opinions of the experts who work upstream on and
maintain these components, since it is an area I am not so familiar
with.

Generally speaking asking me directly for opinions on matters like
this isn't the way to go, in fact I kind of find it irritating.  It
can't all be on me.

Re: [PATCH v2 net-next 2/2] net: cls_u32: Add support for skip-sw flag to tc u32 classifier.

2016-05-13 Thread John Fastabend

On 16-05-12 05:08 PM, Sridhar Samudrala wrote:
> On devices that support TC U32 offloads, this flag enables a filter to be
> added only to HW. skip-sw and skip-hw are mutually exclusive flags. By
> default without any flags, the filter is added to both HW and SW, but no
> error checks are done in case of failure to add to HW. With skip-sw,
> failure to add to HW is treated as an error.
> 
> Here is a sample script that adds 2 filters, one with skip-sw and the other
> with skip-hw flag.
> 
># add ingress qdisc
>tc qdisc add dev p4p1 ingress
> 
># enable hw tc offload.
>ethtool -K p4p1 hw-tc-offload on
> 
># add u32 filter with skip-sw flag.
>tc filter add dev p4p1 parent : protocol ip prio 99 \
>   handle 800:0:1 u32 ht 800: flowid 800:1 \
>   skip-sw \
>   match ip src 192.168.1.0/24 \
>   action drop
> 
># add u32 filter with skip-hw flag.
>tc filter add dev p4p1 parent : protocol ip prio 99 \
>   handle 800:0:2 u32 ht 800: flowid 800:2 \
>   skip-hw \
>   match ip src 192.168.2.0/24 \
>   action drop
> 
> Signed-off-by: Sridhar Samudrala 
> ---

Looks good to me thanks for doing this.

Acked-by: John Fastabend

Re: [RFC PATCH 2/2] Documentation: devictree: Add macb mdio bindings

2016-05-13 Thread Andrew Lunn

Hi Harini

Is this backward compatible? Will devices using the old binding still
work?

  /* Disable RX and TX (XXX: Should we halt the transmission
   * more gracefully?)
*/
-   macb_writel(bp, NCR, 0);
+   ctrl = macb_readl(bp, NCR);
+   ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
+   macb_writel(bp, NCR, ctrl);
 
/* Clear the stats registers (XXX: Update stats first?) */
-   macb_writel(bp, NCR, MACB_BIT(CLRSTAT));
+   ctrl |= MACB_BIT(CLRSTAT);
+   macb_writel(bp, NCR, ctrl);
 
/* Clear all status flags */
macb_writel(bp, TSR, -1);

It is not clear to me what this part has to do with MDIO.

   Andrew

Re: [PATCH v2 net-next 1/2] net: sched: Move TCA_CLS_FLAGS_SKIP_HW to uapi header file.

2016-05-13 Thread John Fastabend

On 16-05-12 05:08 PM, Sridhar Samudrala wrote:
> Signed-off-by: Sridhar Samudrala 
> ---

Acked-by: John Fastabend

Re: [RFC PATCH 2/2] Documentation: devictree: Add macb mdio bindings

2016-05-13 Thread Andrew Lunn

> + mdio {
> + compatible = "cdns,macb-mdio";
> + reg = <0x0 0xff0b 0x0 0x1000>;
> + clocks = <>, <>, <>;
> + clock-names = "pclk", "hclk", "tx_clk";
> + ethernet_phyC: ethernet-phy@C {
> + reg = ;

I think that needs an 0x prefix.

  Andrew

Re: [RFC PATCH 0/2] net: threadable napi poll loop

2016-05-13 Thread Eric Dumazet

On Fri, May 13, 2016 at 10:19 AM, Paolo Abeni  wrote:

> The difference is small, in the noise range:
>
> [with this patch applied]
> super_netperf 100 -H 192.168.122.1 -t UDP_STREAM -l 60 -- -m 1
> 9.00
>
> [adding the test into __local_bh_enable_ip(), too]
> super_netperf 100 -H 192.168.122.1 -t UDP_STREAM -l 60 -- -m 1
> 9.14
>
> but reproducible, in my experiments.
> I have similar data for different number of flows.
>
>> I believe I did this so that we factorize the logic in do_softirq()
>> and keep the code local to kernel/softirq.c
>>
>> Otherwise, netif_rx_ni() could also process softirq while ksoftirqd
>> was scheduled,
>> so I would have to  'export' the ksoftirqd_running(void) helper in an
>> include file.
>
> The idea could be to add the test in __local_bh_enable_ip(), maintaining
> the test also in do_softirq() (as currently done, i.e for
> local_softirq_pending())
>

Then I guess even the !in_interrupt() test we do is expensive and
could be avoided,
since do_softirq() is doing it again in the unlikely case it really is needed.

@@ -162,7 +170,8 @@ void __local_bh_enable_ip(unsigned long ip,
unsigned int cnt)
 */
preempt_count_sub(cnt - 1);

-   if (unlikely(!in_interrupt() && local_softirq_pending())) {
+   if (unlikely(local_softirq_pending()) &&
+!ksoftirqd_running()) {
/*
 * Run softirq if any pending. And do it in its own stack
 * as we may be calling this deep in a task call stack already.

Re: [patch net-next v3 2/4] rtnetlink: add HW/SW stats distinction in rtnl_fill_stats

2016-05-13 Thread Eric Dumazet

On Fri, 2016-05-13 at 16:54 +0200, Jiri Pirko wrote:
> From: Nogah Frankel 
> 
> Since hardware stats are now returned by default, add a way to
> query only software stats.
> They are saved in IFLA_SW_STATS64.
> (This option is valid only if the driver return HW stats in the
> default ndo stats)
> 
> Signed-off-by: Nogah Frankel 
> Signed-off-by: Jiri Pirko 
> ---
> v2->v3:
> - avoided memcpy as requerted by DaveM
> v1->v2:
> - no change
> ---
>  include/uapi/linux/if_link.h |  1 +
>  net/core/rtnetlink.c | 13 +
>  2 files changed, 14 insertions(+)
> 
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index bb36bd5..98175e7 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -156,6 +156,7 @@ enum {
>   IFLA_GSO_MAX_SEGS,
>   IFLA_GSO_MAX_SIZE,
>   IFLA_PAD,
> + IFLA_SW_STATS64,
>   __IFLA_MAX
>  };
>  
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index d69c464..b0ebac6 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -1054,12 +1054,25 @@ static noinline_for_stack int rtnl_fill_stats(struct 
> sk_buff *skb,
>  
>   attr = nla_reserve_64bit(skb, IFLA_STATS64,
>sizeof(struct rtnl_link_stats64), IFLA_PAD);
> +
>   if (!attr)
>   return -EMSGSIZE;
>  
>   sp = nla_data(attr);
>   dev_get_stats(dev, sp);
>  
> + if (dev_have_sw_stats(dev)) {
> + attr = nla_reserve_64bit(skb, IFLA_SW_STATS64,
> +  sizeof(struct rtnl_link_stats64),
> +  IFLA_PAD);
> +
> + if (!attr)
> + return -EMSGSIZE;
> +
> + sp = nla_data(attr);
> + dev_get_sw_stats(dev, sp);
> + }
> +
>   attr = nla_reserve(skb, IFLA_STATS,
>  sizeof(struct rtnl_link_stats));
>   if (!attr)


You forgot to change if_nlmsg_size() ?

nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) is quite big.

Re: [RFC PATCH 0/2] net: threadable napi poll loop

2016-05-13 Thread Paolo Abeni

On Fri, 2016-05-13 at 10:03 -0700, Eric Dumazet wrote:
> On Fri, May 13, 2016 at 9:50 AM, Paolo Abeni  wrote:
> 
> >> Indeed, and the patch looks quite simple now ;)
> >>
> >> diff --git a/kernel/softirq.c b/kernel/softirq.c
> >> index 
> >> 17caf4b63342d7839528f367b283a386413b0362..23c364485d03618773c385d943c0ef39f5931d09
> >>  100644
> >> --- a/kernel/softirq.c
> >> +++ b/kernel/softirq.c
> >> @@ -57,6 +57,11 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] 
> >> __cacheline_aligned_in_smp
> >>
> >>  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
> >>
> >> +static inline bool ksoftirqd_running(void)
> >> +{
> >> + return __this_cpu_read(ksoftirqd)->state == TASK_RUNNING;
> >> +}
> >> +
> >>  const char * const softirq_to_name[NR_SOFTIRQS] = {
> >>   "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
> >>   "TASKLET", "SCHED", "HRTIMER", "RCU"
> >> @@ -313,7 +318,7 @@ asmlinkage __visible void do_softirq(void)
> >>
> >>   pending = local_softirq_pending();
> >>
> >> - if (pending)
> >> + if (pending && !ksoftirqd_running())
> >>   do_softirq_own_stack();
> >>
> >>   local_irq_restore(flags);
> >> @@ -340,6 +345,9 @@ void irq_enter(void)
> >>
> >>  static inline void invoke_softirq(void)
> >>  {
> >> + if (ksoftirqd_running())
> >> + return;
> >> +
> >>   if (!force_irqthreads) {
> >>  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
> >>   /*
> >
> > In this version of the path, the chunk affecting __local_bh_enable_ip()
> > has been removed.
> >
> > I think it is beneficial, because it allows avoiding a
> > local_irq_save()/local_irq_restore() pairs per local_bh_enable under heavy 
> > load.
> >
> 
> Interesting, do you have any numbers ?

The difference is small, in the noise range:

[with this patch applied]
super_netperf 100 -H 192.168.122.1 -t UDP_STREAM -l 60 -- -m 1 
9.00

[adding the test into __local_bh_enable_ip(), too]
super_netperf 100 -H 192.168.122.1 -t UDP_STREAM -l 60 -- -m 1 
9.14

but reproducible, in my experiments.
I have similar data for different number of flows.

> I believe I did this so that we factorize the logic in do_softirq()
> and keep the code local to kernel/softirq.c
> 
> Otherwise, netif_rx_ni() could also process softirq while ksoftirqd
> was scheduled,
> so I would have to  'export' the ksoftirqd_running(void) helper in an
> include file.

The idea could be to add the test in __local_bh_enable_ip(), maintaining
the test also in do_softirq() (as currently done, i.e for
local_softirq_pending())

Cheers,

Paolo

Re: [patch net-next v3 1/4] netdevice: add SW statistics ndo

2016-05-13 Thread Eric Dumazet

On Fri, 2016-05-13 at 16:54 +0200, Jiri Pirko wrote:
> From: Nogah Frankel 
]
> +bool dev_have_sw_stats(struct net_device *dev)
> +{
> + return (dev->netdev_ops->ndo_get_sw_stats64 != NULL);
> +}
> +EXPORT_SYMBOL(dev_have_sw_stats);
> +

2 things : const, and return is not a function.

bool dev_have_sw_stats(const struct net_device *dev)
{
   return dev->netdev_ops->ndo_get_sw_stats64 != NULL;
}

[PATCH net-next 03/10] bpf: split HAVE_BPF_JIT into cBPF and eBPF variant

2016-05-13 Thread Daniel Borkmann

Split the HAVE_BPF_JIT into two for distinguishing cBPF and eBPF JITs.

Current cBPF ones:

  # git grep -n HAVE_CBPF_JIT arch/
  arch/arm/Kconfig:44:select HAVE_CBPF_JIT
  arch/mips/Kconfig:18:   select HAVE_CBPF_JIT if !CPU_MICROMIPS
  arch/powerpc/Kconfig:129:   select HAVE_CBPF_JIT
  arch/sparc/Kconfig:35:  select HAVE_CBPF_JIT

Current eBPF ones:

  # git grep -n HAVE_EBPF_JIT arch/
  arch/arm64/Kconfig:61:  select HAVE_EBPF_JIT
  arch/s390/Kconfig:126:  select HAVE_EBPF_JIT if PACK_STACK && 
HAVE_MARCH_Z196_FEATURES
  arch/x86/Kconfig:94:select HAVE_EBPF_JITif X86_64

Later code also needs this facility to check for eBPF JITs.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/arm/Kconfig |  2 +-
 arch/arm64/Kconfig   |  2 +-
 arch/mips/Kconfig|  2 +-
 arch/powerpc/Kconfig |  2 +-
 arch/s390/Kconfig|  2 +-
 arch/sparc/Kconfig   |  2 +-
 arch/x86/Kconfig |  2 +-
 net/Kconfig  | 14 +++---
 8 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index cdfa6c2..2315b0d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -41,7 +41,7 @@ config ARM
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_TRACEHOOK
select HAVE_ARM_SMCCC if CPU_V7
-   select HAVE_BPF_JIT
+   select HAVE_CBPF_JIT
select HAVE_CC_STACKPROTECTOR
select HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4f43622..e6761ea 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -58,7 +58,7 @@ config ARM64
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
-   select HAVE_BPF_JIT
+   select HAVE_EBPF_JIT
select HAVE_C_RECORDMCOUNT
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 2018c2b..3ee1ea6 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -15,7 +15,7 @@ config MIPS
select HAVE_ARCH_KGDB
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
-   select HAVE_BPF_JIT if !CPU_MICROMIPS
+   select HAVE_CBPF_JIT if !CPU_MICROMIPS
select HAVE_FUNCTION_TRACER
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7cd32c0..2fdb73d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -126,7 +126,7 @@ config PPC
select IRQ_FORCED_THREADING
select HAVE_RCU_TABLE_FREE if SMP
select HAVE_SYSCALL_TRACEPOINTS
-   select HAVE_BPF_JIT
+   select HAVE_CBPF_JIT
select HAVE_ARCH_JUMP_LABEL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..a883981 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -126,7 +126,7 @@ config S390
select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-   select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
+   select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..d500381 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -32,7 +32,7 @@ config SPARC
select ARCH_WANT_IPC_PARSE_VERSION
select GENERIC_PCI_IOMAP
select HAVE_NMI_WATCHDOG if SPARC64
-   select HAVE_BPF_JIT
+   select HAVE_CBPF_JIT
select HAVE_DEBUG_BUGVERBOSE
select GENERIC_SMP_IDLE_THREAD
select GENERIC_CLOCKEVENTS
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2dc18605..ae83046 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -91,7 +91,7 @@ config X86
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-   select HAVE_BPF_JIT if X86_64
+   select HAVE_EBPF_JITif X86_64
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
diff --git a/net/Kconfig b/net/Kconfig
index b841c42..f7148f2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -289,7 +289,7 @@ config BQL
 
 config BPF_JIT
bool "enable BPF Just In Time compiler"
-   depends on HAVE_BPF_JIT
+   depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
---help---
  Berkeley Packet Filter filtering capabilities are normally handled
@@ -419,6 +419,14 @@ config MAY_USE_DEVLINK
 
 endif   # if NET
 
-# Used by archs to tell that they support BPF_JIT
-config HAVE_BPF_JIT
+# Used by archs to

[PATCH net-next 10/10] bpf, s390: add support for constant blinding

2016-05-13 Thread Daniel Borkmann

This patch adds recently added constant blinding helpers into the
s390 eBPF JIT. In the bpf_int_jit_compile() path, requirements are
to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair for rewriting the program into a blinded one, and to map the
BPF_REG_AX register to a CPU register. The mapping of BPF_REG_AX
is at r12 and similarly like in x86 case performs reloading when
ld_abs/ind is used. When blinding is not used, there's no additional
overhead in the generated image.

When BPF_REG_AX is used, we don't need to emit skb->data reload when
helper function changed skb->data, as this will be reloaded later
on anyway from stack on ld_abs/ind, where skb->data is needed. s390
allows for this w/o much additional complexity unlike f.e. x86.

Signed-off-by: Daniel Borkmann 
Signed-off-by: Michael Holzheu 
---
 arch/s390/net/bpf_jit_comp.c | 73 +---
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index fcf301a..9133b0e 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -54,16 +54,17 @@ struct bpf_jit {
 #define SEEN_FUNC  16  /* calls C functions */
 #define SEEN_TAIL_CALL 32  /* code uses tail calls */
 #define SEEN_SKB_CHANGE64  /* code changes skb data */
+#define SEEN_REG_AX128 /* code uses constant blinding */
 #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
 
 /*
  * s390 registers
  */
-#define REG_W0 (__MAX_BPF_REG+0)   /* Work register 1 (even) */
-#define REG_W1 (__MAX_BPF_REG+1)   /* Work register 2 (odd) */
-#define REG_SKB_DATA   (__MAX_BPF_REG+2)   /* SKB data register */
-#define REG_L  (__MAX_BPF_REG+3)   /* Literal pool register */
-#define REG_15 (__MAX_BPF_REG+4)   /* Register 15 */
+#define REG_W0 (MAX_BPF_JIT_REG + 0)   /* Work register 1 (even) */
+#define REG_W1 (MAX_BPF_JIT_REG + 1)   /* Work register 2 (odd) */
+#define REG_SKB_DATA   (MAX_BPF_JIT_REG + 2)   /* SKB data register */
+#define REG_L  (MAX_BPF_JIT_REG + 3)   /* Literal pool register */
+#define REG_15 (MAX_BPF_JIT_REG + 4)   /* Register 15 */
 #define REG_0  REG_W0  /* Register 0 */
 #define REG_1  REG_W1  /* Register 1 */
 #define REG_2  BPF_REG_1   /* Register 2 */
@@ -88,6 +89,8 @@ static const int reg2hex[] = {
[BPF_REG_9] = 10,
/* BPF stack pointer */
[BPF_REG_FP]= 13,
+   /* Register for blinding (shared with REG_SKB_DATA) */
+   [BPF_REG_AX]= 12,
/* SKB data pointer */
[REG_SKB_DATA]  = 12,
/* Work registers for s390x backend */
@@ -385,7 +388,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op)
 /*
  * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S"
  * we store the SKB header length on the stack and the SKB data
- * pointer in REG_SKB_DATA.
+ * pointer in REG_SKB_DATA if BPF_REG_AX is not used.
  */
 static void emit_load_skb_data_hlen(struct bpf_jit *jit)
 {
@@ -397,9 +400,10 @@ static void emit_load_skb_data_hlen(struct bpf_jit *jit)
   offsetof(struct sk_buff, data_len));
/* stg %w1,ST_OFF_HLEN(%r0,%r15) */
EMIT6_DISP_LH(0xe300, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN);
-   /* lg %skb_data,data_off(%b1) */
-   EMIT6_DISP_LH(0xe300, 0x0004, REG_SKB_DATA, REG_0,
- BPF_REG_1, offsetof(struct sk_buff, data));
+   if (!(jit->seen & SEEN_REG_AX))
+   /* lg %skb_data,data_off(%b1) */
+   EMIT6_DISP_LH(0xe300, 0x0004, REG_SKB_DATA, REG_0,
+ BPF_REG_1, offsetof(struct sk_buff, data));
 }
 
 /*
@@ -487,6 +491,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
s32 imm = insn->imm;
s16 off = insn->off;
 
+   if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
+   jit->seen |= SEEN_REG_AX;
switch (insn->code) {
/*
 * BPF_MOV
@@ -1188,7 +1194,7 @@ call_fn:
/*
 * Implicit input:
 *  BPF_REG_6(R7) : skb pointer
-*  REG_SKB_DATA (R12): skb data pointer
+*  REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX)
 *
 * Calculated input:
 *  BPF_REG_2(R3) : offset of byte(s) to fetch in skb
@@ -1209,6 +1215,11 @@ call_fn:
/* agfr %b2,%src (%src is s32 here) */
EMIT4(0xb918, BPF_REG_2, src_reg);
 
+   /* Reload REG_SKB_DATA if BPF_REG_AX is used */
+   if (jit->seen & SEEN_REG_AX)
+   /* lg %skb_data,data_off(%b6) */
+   EMIT6_DISP_LH(0xe300, 0x0004, REG_SKB_DATA, REG_0,
+

[PATCH net-next 02/10] bpf: move bpf_jit_enable declaration

2016-05-13 Thread Daniel Borkmann

Move the bpf_jit_enable declaration to the filter.h file where
most other core code is declared, also since we're going to add
a second knob there.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/filter.h| 2 ++
 include/linux/netdevice.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ec1411c..4ff0e64 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,6 +496,8 @@ void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
 #ifdef CONFIG_BPF_JIT
+extern int bpf_jit_enable;
+
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
 struct bpf_binary_header *
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 63580e6..c23fd73 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3757,7 +3757,6 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 
*stats64,
 extern int netdev_max_backlog;
 extern int netdev_tstamp_prequeue;
 extern int weight_p;
-extern int bpf_jit_enable;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device 
*upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
-- 
1.9.3

[PATCH net-next 09/10] bpf, arm64: add support for constant blinding

2016-05-13 Thread Daniel Borkmann

This patch adds recently added constant blinding helpers into the
arm64 eBPF JIT. In the bpf_int_jit_compile() path, requirements are
to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair for rewriting the program into a blinded one, and to map the
BPF_REG_AX register to a CPU register. The mapping is on x9.

Signed-off-by: Daniel Borkmann 
Acked-by: Zi Shen Lim 
Acked-by: Yang Shi 
Tested-by: Yang Shi 
---
 arch/arm64/net/bpf_jit_comp.c | 52 +--
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index b421df7..031ed08 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -31,8 +31,8 @@
 
 int bpf_jit_enable __read_mostly;
 
-#define TMP_REG_1 (MAX_BPF_REG + 0)
-#define TMP_REG_2 (MAX_BPF_REG + 1)
+#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
+#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
 
 /* Map BPF registers to A64 registers */
 static const int bpf2a64[] = {
@@ -54,6 +54,8 @@ static const int bpf2a64[] = {
/* temporary register for internal BPF JIT */
[TMP_REG_1] = A64_R(23),
[TMP_REG_2] = A64_R(24),
+   /* temporary register for blinding constants */
+   [BPF_REG_AX] = A64_R(9),
 };
 
 struct jit_ctx {
@@ -763,26 +765,43 @@ void bpf_jit_compile(struct bpf_prog *prog)
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+   struct bpf_prog *tmp, *orig_prog = prog;
struct bpf_binary_header *header;
+   bool tmp_blinded = false;
struct jit_ctx ctx;
int image_size;
u8 *image_ptr;
 
if (!bpf_jit_enable)
-   return prog;
+   return orig_prog;
+
+   tmp = bpf_jit_blind_constants(prog);
+   /* If blinding was requested and we failed during blinding,
+* we must fall back to the interpreter.
+*/
+   if (IS_ERR(tmp))
+   return orig_prog;
+   if (tmp != prog) {
+   tmp_blinded = true;
+   prog = tmp;
+   }
 
memset(, 0, sizeof(ctx));
ctx.prog = prog;
 
ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
-   if (ctx.offset == NULL)
-   return prog;
+   if (ctx.offset == NULL) {
+   prog = orig_prog;
+   goto out;
+   }
 
/* 1. Initial fake pass to compute ctx->idx. */
 
/* Fake pass to fill in ctx->offset and ctx->tmp_used. */
-   if (build_body())
-   goto out;
+   if (build_body()) {
+   prog = orig_prog;
+   goto out_off;
+   }
 
build_prologue();
 
@@ -793,8 +812,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
image_size = sizeof(u32) * ctx.idx;
header = bpf_jit_binary_alloc(image_size, _ptr,
  sizeof(u32), jit_fill_hole);
-   if (header == NULL)
-   goto out;
+   if (header == NULL) {
+   prog = orig_prog;
+   goto out_off;
+   }
 
/* 2. Now, the actual pass. */
 
@@ -805,7 +826,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
if (build_body()) {
bpf_jit_binary_free(header);
-   goto out;
+   prog = orig_prog;
+   goto out_off;
}
 
build_epilogue();
@@ -813,7 +835,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
/* 3. Extra pass to validate JITed code. */
if (validate_code()) {
bpf_jit_binary_free(header);
-   goto out;
+   prog = orig_prog;
+   goto out_off;
}
 
/* And we're done. */
@@ -825,8 +848,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
set_memory_ro((unsigned long)header, header->pages);
prog->bpf_func = (void *)ctx.image;
prog->jited = 1;
-out:
+
+out_off:
kfree(ctx.offset);
+out:
+   if (tmp_blinded)
+   bpf_jit_prog_release_other(prog, prog == orig_prog ?
+  tmp : orig_prog);
return prog;
 }
 
-- 
1.9.3

[PATCH net-next 00/10] BPF updates

2016-05-13 Thread Daniel Borkmann

This set implements constant blinding for BPF, first couple of
patches are some preparatory cleanups, followed by the blinding.
Please see individual patches for details.

Thanks a lot!

Daniel Borkmann (10):
  bpf: minor cleanups in ebpf code
  bpf: move bpf_jit_enable declaration
  bpf: split HAVE_BPF_JIT into cBPF and eBPF variant
  bpf, x86/arm64: remove useless checks on prog
  bpf: add bpf_patch_insn_single helper
  bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis
  bpf: add generic constant blinding for use in jits
  bpf, x86: add support for constant blinding
  bpf, arm64: add support for constant blinding
  bpf, s390: add support for constant blinding

 Documentation/sysctl/net.txt  |  11 ++
 arch/arm/Kconfig  |   2 +-
 arch/arm64/Kconfig|   2 +-
 arch/arm64/net/bpf_jit_comp.c |  56 +---
 arch/mips/Kconfig |   2 +-
 arch/powerpc/Kconfig  |   2 +-
 arch/s390/Kconfig |   2 +-
 arch/s390/net/bpf_jit_comp.c  |  77 ---
 arch/sparc/Kconfig|   2 +-
 arch/x86/Kconfig  |   2 +-
 arch/x86/net/bpf_jit_comp.c   |  70 +++---
 include/linux/filter.h|  52 +++-
 include/linux/netdevice.h |   1 -
 kernel/bpf/core.c | 294 +-
 kernel/bpf/syscall.c  |   2 +-
 kernel/bpf/verifier.c |  53 ++--
 lib/test_bpf.c|   5 +-
 net/Kconfig   |  21 ++-
 net/core/filter.c |  40 +++---
 net/core/sysctl_net_core.c|   9 ++
 20 files changed, 569 insertions(+), 136 deletions(-)

-- 
1.9.3

[PATCH net-next 01/10] bpf: minor cleanups in ebpf code

2016-05-13 Thread Daniel Borkmann

Besides others, remove redundant comments where the code is self
documenting enough, and properly indent various bpf_verifier_ops
and bpf_prog_type_list declarations. Moreover, remove two exports
that actually have no module user.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/core.c |  2 --
 net/core/filter.c | 34 +++---
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d781b07..5313d09 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -129,14 +129,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog 
*fp_old, unsigned int size,
 
return fp;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
kfree(fp->aux);
vfree(fp);
 }
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
 
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
diff --git a/net/core/filter.c b/net/core/filter.c
index 71c2a1f..ea51b47 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2069,16 +2069,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
-   /* check bounds */
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
-
-   /* disallow misaligned access */
+   /* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
-
-   /* all __sk_buff fields are __u32 */
-   if (size != 4)
+   if (size != sizeof(__u32))
return false;
 
return true;
@@ -2097,7 +2093,7 @@ static bool sk_filter_is_valid_access(int off, int size,
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, cb[0]) ...
-   offsetof(struct __sk_buff, cb[4]):
+offsetof(struct __sk_buff, cb[4]):
break;
default:
return false;
@@ -2278,30 +2274,30 @@ static u32 bpf_net_convert_ctx_access(enum 
bpf_access_type type, int dst_reg,
 }
 
 static const struct bpf_verifier_ops sk_filter_ops = {
-   .get_func_proto = sk_filter_func_proto,
-   .is_valid_access = sk_filter_is_valid_access,
-   .convert_ctx_access = bpf_net_convert_ctx_access,
+   .get_func_proto = sk_filter_func_proto,
+   .is_valid_access= sk_filter_is_valid_access,
+   .convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops tc_cls_act_ops = {
-   .get_func_proto = tc_cls_act_func_proto,
-   .is_valid_access = tc_cls_act_is_valid_access,
-   .convert_ctx_access = bpf_net_convert_ctx_access,
+   .get_func_proto = tc_cls_act_func_proto,
+   .is_valid_access= tc_cls_act_is_valid_access,
+   .convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
-   .ops = _filter_ops,
-   .type = BPF_PROG_TYPE_SOCKET_FILTER,
+   .ops= _filter_ops,
+   .type   = BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
 static struct bpf_prog_type_list sched_cls_type __read_mostly = {
-   .ops = _cls_act_ops,
-   .type = BPF_PROG_TYPE_SCHED_CLS,
+   .ops= _cls_act_ops,
+   .type   = BPF_PROG_TYPE_SCHED_CLS,
 };
 
 static struct bpf_prog_type_list sched_act_type __read_mostly = {
-   .ops = _cls_act_ops,
-   .type = BPF_PROG_TYPE_SCHED_ACT,
+   .ops= _cls_act_ops,
+   .type   = BPF_PROG_TYPE_SCHED_ACT,
 };
 
 static int __init register_sk_filter_ops(void)
-- 
1.9.3

[PATCH net-next 05/10] bpf: add bpf_patch_insn_single helper

2016-05-13 Thread Daniel Borkmann

Move the functionality to patch instructions out of the verifier
code and into the core as the new bpf_patch_insn_single() helper
will be needed later on for blinding as well. No changes in
functionality.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c  | 71 ++
 kernel/bpf/verifier.c  | 53 +++--
 3 files changed, 83 insertions(+), 44 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ff0e64..c4aae49 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -495,6 +495,9 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+  const struct bpf_insn *patch, u32 len);
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5313d09..49b5538 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
 }
 
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+   return BPF_CLASS(insn->code) == BPF_JMP  &&
+  /* Call and Exit are both special jumps with no
+   * target inside the BPF instruction image.
+   */
+  BPF_OP(insn->code) != BPF_CALL &&
+  BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+   struct bpf_insn *insn = prog->insnsi;
+   u32 i, insn_cnt = prog->len;
+
+   for (i = 0; i < insn_cnt; i++, insn++) {
+   if (!bpf_is_jmp_and_has_target(insn))
+   continue;
+
+   /* Adjust offset of jmps if we cross boundaries. */
+   if (i < pos && i + insn->off + 1 > pos)
+   insn->off += delta;
+   else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+   insn->off -= delta;
+   }
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+  const struct bpf_insn *patch, u32 len)
+{
+   u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+   struct bpf_prog *prog_adj;
+
+   /* Since our patchlet doesn't expand the image, we're done. */
+   if (insn_delta == 0) {
+   memcpy(prog->insnsi + off, patch, sizeof(*patch));
+   return prog;
+   }
+
+   insn_adj_cnt = prog->len + insn_delta;
+
+   /* Several new instructions need to be inserted. Make room
+* for them. Likely, there's no need for a new allocation as
+* last page could have large enough tailroom.
+*/
+   prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+   GFP_USER);
+   if (!prog_adj)
+   return NULL;
+
+   prog_adj->len = insn_adj_cnt;
+
+   /* Patching happens in 3 steps:
+*
+* 1) Move over tail of insnsi from next instruction onwards,
+*so we can patch the single target insn with one or more
+*new ones (patching is always from 1 to n insns, n > 0).
+* 2) Inject new instructions at the target location.
+* 3) Adjust branch offsets if necessary.
+*/
+   insn_rest = insn_adj_cnt - off - len;
+
+   memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+   sizeof(*patch) * insn_rest);
+   memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+   bpf_adj_branches(prog_adj, off, insn_delta);
+
+   return prog_adj;
+}
+
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 84bff68..a08d662 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env 
*env)
insn->src_reg = 0;
 }
 
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
-   struct bpf_insn *insn = prog->insnsi;
-   int insn_cnt = prog->len;
-   int i;
-
-   for (i = 0; i < insn_cnt; i++, insn++) {
-   if (BPF_CLASS(insn->code) != BPF_JMP ||
-   BPF_OP(insn->code) == BPF_CALL ||
-   BPF_OP(insn->code) == BPF_EXIT)
-   continue;
-
-   /* adjust offset of jmps if necessary */
-   if (i < pos && i + insn->off + 1 > pos)
-   insn->off += delta;
-   else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-   insn->off -= delta;
-   }
-}
-

[PATCH net-next 06/10] bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis

2016-05-13 Thread Daniel Borkmann

Since the blinding is strictly only called from inside eBPF JITs,
we need to change signatures for bpf_int_jit_compile() and
bpf_prog_select_runtime() first in order to prepare that the
eBPF program we're dealing with can change underneath. Hence,
for call sites, we need to return the latest prog. No functional
change in this patch.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/arm64/net/bpf_jit_comp.c |  7 ---
 arch/s390/net/bpf_jit_comp.c  |  8 +---
 arch/x86/net/bpf_jit_comp.c   |  7 ---
 include/linux/filter.h|  5 +++--
 kernel/bpf/core.c | 18 ++
 kernel/bpf/syscall.c  |  2 +-
 lib/test_bpf.c|  5 -
 net/core/filter.c |  6 +-
 8 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a7fb209..b421df7 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -761,7 +761,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
/* Nothing to do here. We support Internal BPF. */
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
struct bpf_binary_header *header;
struct jit_ctx ctx;
@@ -769,14 +769,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
u8 *image_ptr;
 
if (!bpf_jit_enable)
-   return;
+   return prog;
 
memset(, 0, sizeof(ctx));
ctx.prog = prog;
 
ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
if (ctx.offset == NULL)
-   return;
+   return prog;
 
/* 1. Initial fake pass to compute ctx->idx. */
 
@@ -827,6 +827,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
prog->jited = 1;
 out:
kfree(ctx.offset);
+   return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *prog)
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 3c0bfc1..fcf301a 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1262,18 +1262,19 @@ void bpf_jit_compile(struct bpf_prog *fp)
 /*
  * Compile eBPF program "fp"
  */
-void bpf_int_jit_compile(struct bpf_prog *fp)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
struct bpf_binary_header *header;
struct bpf_jit jit;
int pass;
 
if (!bpf_jit_enable)
-   return;
+   return fp;
+
memset(, 0, sizeof(jit));
jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL)
-   return;
+   return fp;
/*
 * Three initial passes:
 *   - 1/2: Determine clobbered registers
@@ -1305,6 +1306,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp)
}
 free_addrs:
kfree(jit.addrs);
+   return fp;
 }
 
 /*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f5bfd4f..6b2d23e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1073,7 +1073,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 {
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
struct bpf_binary_header *header = NULL;
int proglen, oldproglen = 0;
@@ -1084,11 +1084,11 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
int i;
 
if (!bpf_jit_enable)
-   return;
+   return prog;
 
addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
if (!addrs)
-   return;
+   return prog;
 
/* Before first pass, make a rough estimation of addrs[]
 * each bpf instruction is translated to less than 64 bytes
@@ -1140,6 +1140,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
}
 out:
kfree(addrs);
+   return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *fp)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c4aae49..891852c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-int bpf_prog_select_runtime(struct bpf_prog *fp);
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
@@ -492,7 +492,8 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter 
*fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-void bpf_int_jit_compile(struct bpf_prog *fp);
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_skb_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog,

[PATCH net-next 08/10] bpf, x86: add support for constant blinding

2016-05-13 Thread Daniel Borkmann

This patch adds recently added constant blinding helpers into the
x86 eBPF JIT. In the bpf_int_jit_compile() path, requirements are
to utilize bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair for rewriting the program into a blinded one, and to map the
BPF_REG_AX register to a CPU register. The mapping of BPF_REG_AX
is at non-callee saved register r10, and thus shared with cached
skb->data used for ld_abs/ind and not in every program type needed.
When blinding is not used, there's zero additional overhead in the
generated image.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/x86/net/bpf_jit_comp.c | 66 -
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 6b2d23e..fe04a04 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -110,11 +110,16 @@ static void bpf_flush_icache(void *start, void *end)
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : 
func##_positive_offset)
 
 /* pick a register outside of BPF range for JIT internal work */
-#define AUX_REG (MAX_BPF_REG + 1)
+#define AUX_REG (MAX_BPF_JIT_REG + 1)
 
-/* the following table maps BPF registers to x64 registers.
- * x64 register r12 is unused, since if used as base address register
- * in load/store instructions, it always needs an extra byte of encoding
+/* The following table maps BPF registers to x64 registers.
+ *
+ * x64 register r12 is unused, since if used as base address
+ * register in load/store instructions, it always needs an
+ * extra byte of encoding and is callee saved.
+ *
+ *  r9 caches skb->len - skb->data_len
+ * r10 caches skb->data, and used for blinding (if enabled)
  */
 static const int reg2hex[] = {
[BPF_REG_0] = 0,  /* rax */
@@ -128,6 +133,7 @@ static const int reg2hex[] = {
[BPF_REG_8] = 6,  /* r14 callee saved */
[BPF_REG_9] = 7,  /* r15 callee saved */
[BPF_REG_FP] = 5, /* rbp readonly */
+   [BPF_REG_AX] = 2, /* r10 temp register */
[AUX_REG] = 3,/* r11 temp register */
 };
 
@@ -141,7 +147,8 @@ static bool is_ereg(u32 reg)
 BIT(AUX_REG) |
 BIT(BPF_REG_7) |
 BIT(BPF_REG_8) |
-BIT(BPF_REG_9));
+BIT(BPF_REG_9) |
+BIT(BPF_REG_AX));
 }
 
 /* add modifiers if 'reg' maps to x64 registers r8..r15 */
@@ -182,6 +189,7 @@ static void jit_fill_hole(void *area, unsigned int size)
 struct jit_context {
int cleanup_addr; /* epilogue code offset */
bool seen_ld_abs;
+   bool seen_ax_reg;
 };
 
 /* maximum number of bytes emitted while JITing one eBPF insn */
@@ -345,6 +353,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 
*image,
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+   bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0;
@@ -367,6 +376,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 
*image,
int ilen;
u8 *func;
 
+   if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
+   ctx->seen_ax_reg = seen_ax_reg = true;
+
switch (insn->code) {
/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
@@ -1002,6 +1014,10 @@ common_load:
 * sk_load_* helpers also use %r10 and %r9d.
 * See bpf_jit.S
 */
+   if (seen_ax_reg)
+   /* r10 = skb->data, mov %r10, off32(%rbx) */
+   EMIT3_off32(0x4c, 0x8b, 0x93,
+   offsetof(struct sk_buff, data));
EMIT1_off32(0xE8, jmp_offset); /* call */
break;
 
@@ -1076,19 +1092,34 @@ void bpf_jit_compile(struct bpf_prog *prog)
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
struct bpf_binary_header *header = NULL;
+   struct bpf_prog *tmp, *orig_prog = prog;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
+   bool tmp_blinded = false;
u8 *image = NULL;
int *addrs;
int pass;
int i;
 
if (!bpf_jit_enable)
-   return prog;
+   return orig_prog;
+
+   tmp = bpf_jit_blind_constants(prog);
+   /* If blinding was requested and we failed during blinding,
+* we must fall back to the interpreter.
+*/
+   if (IS_ERR(tmp))
+   return orig_prog;
+   if (tmp != prog) {
+

[PATCH net-next 04/10] bpf, x86/arm64: remove useless checks on prog

2016-05-13 Thread Daniel Borkmann

There is never such a situation, where bpf_int_jit_compile() is
called with either prog as NULL or len as 0, so the tests are
unnecessary and confusing as people would just copy them. s390
doesn't have them, so no change is needed there.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 arch/arm64/net/bpf_jit_comp.c | 3 ---
 arch/x86/net/bpf_jit_comp.c   | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a34420a..a7fb209 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -771,9 +771,6 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
if (!bpf_jit_enable)
return;
 
-   if (!prog || !prog->len)
-   return;
-
memset(, 0, sizeof(ctx));
ctx.prog = prog;
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4286f36..f5bfd4f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1086,9 +1086,6 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
if (!bpf_jit_enable)
return;
 
-   if (!prog || !prog->len)
-   return;
-
addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
if (!addrs)
return;
-- 
1.9.3

[PATCH net-next 07/10] bpf: add generic constant blinding for use in jits

2016-05-13 Thread Daniel Borkmann

This work adds a generic facility for use from eBPF JIT compilers
that allows for further hardening of JIT generated images through
blinding constants. In response to the original work on BPF JIT
spraying published by Keegan McAllister [1], most BPF JITs were
changed to make images read-only and start at a randomized offset
in the page, where the rest was filled with trap instructions. We
have this nowadays in x86, arm, arm64 and s390 JIT compilers.
Additionally, later work also made eBPF interpreter images read
only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86,
arm, arm64 and s390 archs as well currently. This is done by
default for mentioned JITs when JITing is enabled. Furthermore,
we had a generic and configurable constant blinding facility on our
todo for quite some time now to further make spraying harder, and
first implementation since around netconf 2016.

We found that for systems where untrusted users can load cBPF/eBPF
code where JIT is enabled, start offset randomization helps a bit
to make jumps into crafted payload harder, but in case where larger
programs that cross page boundary are injected, we again have some
part of the program opcodes at a page start offset. With improved
guessing and more reliable payload injection, chances can increase
to jump into such payload. Elena Reshetova recently wrote a test
case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which
can leave some more room for payloads. Note that for all this,
additional bugs in the kernel are still required to make the jump
(and of course to guess right, to not jump into a trap) and naturally
the JIT must be enabled, which is disabled by default.

For helping mitigation, the general idea is to provide an option
bpf_jit_harden that admins can tweak along with bpf_jit_enable, so
that for cases where JIT should be enabled for performance reasons,
the generated image can be further hardened with blinding constants
for unpriviledged users (bpf_jit_harden == 1), with trading off
performance for these, but not for privileged ones. We also added
the option of blinding for all users (bpf_jit_harden == 2), which
is quite helpful for testing f.e. with test_bpf.ko. There are no
further e.g. hardening levels of bpf_jit_harden switch intended,
rationale is to have it dead simple to use as on/off. Since this
functionality would need to be duplicated over and over for JIT
compilers to use, which are already complex enough, we provide a
generic eBPF byte-code level based blinding implementation, which is
then just transparently JITed. JIT compilers need to make only a few
changes to integrate this facility and can be migrated one by one.

This option is for eBPF JITs and will be used in x86, arm64, s390
without too much effort, and soon ppc64 JITs, thus that native eBPF
can be blinded as well as cBPF to eBPF migrations, so that both can
be covered with a single implementation. The rule for JITs is that
bpf_jit_blind_constants() must be called from bpf_int_jit_compile(),
and in case blinding is disabled, we follow normally with JITing the
passed program. In case blinding is enabled and we fail during the
process of blinding itself, we must return with the interpreter.
Similarly, in case the JITing process after the blinding failed, we
return normally to the interpreter with the non-blinded code. Meaning,
interpreter doesn't change in any way and operates on eBPF code as
usual. For doing this pre-JIT blinding step, we need to make use of
a helper/auxiliary register, here BPF_REG_AX. This is strictly internal
to the JIT and not in any way part of the eBPF architecture. Just like
in the same way as JITs internally make use of some helper registers
when emitting code, only that here the helper register is one
abstraction level higher in eBPF bytecode, but nevertheless in JIT
phase. That helper register is needed since f.e. manually written
program can issue loads to all registers of eBPF architecture.

The core concept with the additional register is: blind out all 32
and 64 bit constants by converting BPF_K based instructions into a
small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this
is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND,
and REG  BPF_REG_AX, so actual operation on the target register
is translated from BPF_K into BPF_X one that is operating on
BPF_REG_AX's content. During rewriting phase when blinding, RND is
newly generated via prandom_u32() for each processed instruction.
64 bit loads are split into two 32 bit loads to make translation and
patching not too complex. Only basic thing required by JITs is to
call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair, and to map BPF_REG_AX into an unused register.

Small bpf_jit_disasm extract from [2] when applied to x86 JIT:

echo 0 > /proc/sys/net/core/bpf_jit_harden

  a034f5e9 + :
  [...]
  39:   mov$0xa8909090,%eax
  3e:   mov$0xa8909090,%eax
  43:   mov$0xa8ff3148,%eax
  48:   mov

Re: [RFC PATCH 0/2] net: threadable napi poll loop

2016-05-13 Thread Eric Dumazet

On Fri, May 13, 2016 at 9:50 AM, Paolo Abeni  wrote:

>> Indeed, and the patch looks quite simple now ;)
>>
>> diff --git a/kernel/softirq.c b/kernel/softirq.c
>> index 
>> 17caf4b63342d7839528f367b283a386413b0362..23c364485d03618773c385d943c0ef39f5931d09
>>  100644
>> --- a/kernel/softirq.c
>> +++ b/kernel/softirq.c
>> @@ -57,6 +57,11 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] 
>> __cacheline_aligned_in_smp
>>
>>  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
>>
>> +static inline bool ksoftirqd_running(void)
>> +{
>> + return __this_cpu_read(ksoftirqd)->state == TASK_RUNNING;
>> +}
>> +
>>  const char * const softirq_to_name[NR_SOFTIRQS] = {
>>   "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
>>   "TASKLET", "SCHED", "HRTIMER", "RCU"
>> @@ -313,7 +318,7 @@ asmlinkage __visible void do_softirq(void)
>>
>>   pending = local_softirq_pending();
>>
>> - if (pending)
>> + if (pending && !ksoftirqd_running())
>>   do_softirq_own_stack();
>>
>>   local_irq_restore(flags);
>> @@ -340,6 +345,9 @@ void irq_enter(void)
>>
>>  static inline void invoke_softirq(void)
>>  {
>> + if (ksoftirqd_running())
>> + return;
>> +
>>   if (!force_irqthreads) {
>>  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
>>   /*
>
> In this version of the path, the chunk affecting __local_bh_enable_ip()
> has been removed.
>
> I think it is beneficial, because it allows avoiding a
> local_irq_save()/local_irq_restore() pairs per local_bh_enable under heavy 
> load.
>

Interesting, do you have any numbers ?

I believe I did this so that we factorize the logic in do_softirq()
and keep the code local to kernel/softirq.c

Otherwise, netif_rx_ni() could also process softirq while ksoftirqd
was scheduled,
so I would have to  'export' the ksoftirqd_running(void) helper in an
include file.

I noticed that simply doing a "ping -n gateway" while the UDP flood
was occurring, my udp receiver had quite a different efficiency.

Thanks.

Re: [PATCH 3.2 085/115] veth: don???t modify ip_summed; doing so treats packets with bad checksums as good.

2016-05-13 Thread Ben Greear


Mr Miller:

How do you feel about a new socket-option to allow a socket to
request the old veth behaviour?

Thanks,
Ben

On 04/30/2016 10:30 PM, Willy Tarreau wrote:

On Sat, Apr 30, 2016 at 03:43:51PM -0700, Ben Greear wrote:

On 04/30/2016 03:01 PM, Vijay Pandurangan wrote:

Consider:

- App A  sends out corrupt packets 50% of the time and discards inbound data.

(...)

How can you make a generic app C know how to do this?  The path could be,
for instance:

eth0 <-> user-space-A <-> vethA <-> vethB <-> { kernel routing logic } <-> vethC <-> 
vethD <-> appC

There are no sockets on vethB, but it does need to have special behaviour to 
elide
csums.  Even if appC is hacked to know how to twiddle some thing on it's veth 
port,
mucking with vethD will have no effect on vethB.

With regard to your example above, why would A corrupt packets?  My guess:

1)  It has bugs (so, fix the bugs, it could equally create incorrect data with 
proper checksums,
 so just enabling checksumming adds no useful protection.)


I agree with Ben here, what he needs is the ability for userspace to be
trusted when *forwarding* a packet. Ideally you'd only want to receive
the csum status per packet on the packet socket and pass the same value
on the vethA interface, with this status being kept when the packet
reaches vethB.

If A purposely corrupts packet, it's A's problem. It's similar to designing
a NIC which intentionally corrupts packets and reports "checksum good".

The real issue is that in order to do things right, the userspace bridge
(here, "A") would really need to pass this status. In Ben's case as he
says, bad checksum packets are dropped before reaching A, so that
simplifies the process quite a bit and that might be what causes some
confusion, but ideally we'd rather have recvmsg() and sendmsg() with
these flags.

I faced the exact same issue 3 years ago when playing with netmap, it was
slow as hell because it would lose all checksum information when packets
were passing through userland, resulting in GRO/GSO etc being disabled,
and had to modify it to let userland preserve it. That's especially
important when you have to deal with possibly corrupted packets not yet
detected in the chain because the NIC did not validate their checksums.

Willy




--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com

Re: [RFC PATCH 0/2] net: threadable napi poll loop

2016-05-13 Thread Paolo Abeni

On Wed, 2016-05-11 at 14:56 -0700, Eric Dumazet wrote:
> On Wed, 2016-05-11 at 08:55 +0200, Peter Zijlstra wrote:
> > On Tue, May 10, 2016 at 03:51:37PM -0700, Eric Dumazet wrote:
> > > diff --git a/kernel/softirq.c b/kernel/softirq.c
> > > index 17caf4b63342..22463217e3cf 100644
> > > --- a/kernel/softirq.c
> > > +++ b/kernel/softirq.c
> > > @@ -56,6 +56,7 @@ EXPORT_SYMBOL(irq_stat);
> > >  static struct softirq_action softirq_vec[NR_SOFTIRQS] 
> > > __cacheline_aligned_in_smp;
> > >  
> > >  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
> > > +DEFINE_PER_CPU(bool, ksoftirqd_scheduled);
> > >  
> > >  const char * const softirq_to_name[NR_SOFTIRQS] = {
> > >   "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
> > > @@ -73,8 +74,10 @@ static void wakeup_softirqd(void)
> > >   /* Interrupts are disabled: no need to stop preemption */
> > >   struct task_struct *tsk = __this_cpu_read(ksoftirqd);
> > >  
> > > - if (tsk && tsk->state != TASK_RUNNING)
> > > + if (tsk && tsk->state != TASK_RUNNING) {
> > > + __this_cpu_write(ksoftirqd_scheduled, true);
> > >   wake_up_process(tsk);
> > 
> > Since we're already looking at tsk->state, and the wake_up_process()
> > ensures the thing becomes TASK_RUNNING, you could add:
> > 
> > static inline bool ksoftirqd_running(void)
> > {
> > return __this_cpu_read(ksoftirqd)->state == TASK_RUNNING;
> > }
> 
> Indeed, and the patch looks quite simple now ;)
> 
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 
> 17caf4b63342d7839528f367b283a386413b0362..23c364485d03618773c385d943c0ef39f5931d09
>  100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -57,6 +57,11 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] 
> __cacheline_aligned_in_smp
>  
>  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
>  
> +static inline bool ksoftirqd_running(void)
> +{
> + return __this_cpu_read(ksoftirqd)->state == TASK_RUNNING;
> +}
> +
>  const char * const softirq_to_name[NR_SOFTIRQS] = {
>   "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
>   "TASKLET", "SCHED", "HRTIMER", "RCU"
> @@ -313,7 +318,7 @@ asmlinkage __visible void do_softirq(void)
>  
>   pending = local_softirq_pending();
>  
> - if (pending)
> + if (pending && !ksoftirqd_running())
>   do_softirq_own_stack();
>  
>   local_irq_restore(flags);
> @@ -340,6 +345,9 @@ void irq_enter(void)
>  
>  static inline void invoke_softirq(void)
>  {
> + if (ksoftirqd_running())
> + return;
> +
>   if (!force_irqthreads) {
>  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
>   /*

In this version of the path, the chunk affecting __local_bh_enable_ip()
has been removed.

I think it is beneficial, because it allows avoiding a
local_irq_save()/local_irq_restore() pairs per local_bh_enable under heavy load.

Cheers,

Paolo

1 2 >

1 - 100 of 195 matches

Mail list logo