Re: [PATCH 16/16] thunderbolt: Add support for networking over Thunderbolt cable

2017-09-19 Thread Mika Westerberg
On Tue, Sep 19, 2017 at 01:21:44AM +0200, Andrew Lunn wrote:
> On Mon, Sep 18, 2017 at 06:30:49PM +0300, Mika Westerberg wrote:
> > From: Amir Levy 
> > 
> > ThunderboltIP is a protocol created by Apple to tunnel IP/ethernet
> > traffic over a Thunderbolt cable. The protocol consists of configuration
> > phase where each side sends ThunderboltIP login packets (the protocol is
> > determined by UUID in the XDomain packet header) over the configuration
> > channel. Once both sides get positive acknowledgment to their login
> > packet, they configure high-speed DMA path accordingly. This DMA path is
> > then used to transmit and receive networking traffic.
> > 
> > This patch creates a virtual ethernet interface the host software can
> > use in the same way as any other networking interface. Once the
> > interface is brought up successfully network packets get tunneled over
> > the Thunderbolt cable to the remote host and back.
> > 
> > The connection is terminated by sending a ThunderboltIP logout packet
> > over the configuration channel. We do this when the network interface is
> > brought down by user or the driver is unloaded.
> > 
> > Signed-off-by: Amir Levy 
> > Signed-off-by: Michael Jamet 
> > Signed-off-by: Mika Westerberg 
> > Reviewed-by: Yehezkel Bernat 
> > ---
> >  Documentation/admin-guide/thunderbolt.rst |   24 +
> >  drivers/thunderbolt/Kconfig   |   12 +
> >  drivers/thunderbolt/Makefile  |3 +
> >  drivers/thunderbolt/net.c | 1392 
> > +
> >  4 files changed, 1431 insertions(+)
> >  create mode 100644 drivers/thunderbolt/net.c
> 
> Hi Mika
> 
> Could this be renamed to driver/net/thunderbolt.c?

I pondered between drivers/thunderbolt/net.c and
drivers/net/thunderbolt.c and then decided to go with the former because
it follows drivers/firewire/net.c and kind of makes it easier for users
to enable.

But no problem moving it into drivers/net if that's what networking
people prefer.

> At minimum, it needs a MAINTAINER entry pointing to netdev, so patches
> get reviewed by netdev people. However, since the driver seems to be a
> lot more netdev than thunderbolt, placing it in driver/net could be
> better.

OK.


[PATCH,net-next,2/2] tun: enable napi_gro_frags() for TUN/TAP driver

2017-09-19 Thread Petar Penkov
Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iovec_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities.

Signed-off-by: Petar Penkov 
Cc: Eric Dumazet 
Cc: Mahesh Bandewar 
Cc: Willem de Bruijn 
Cc: da...@davemloft.net
Cc: ppen...@stanford.edu
---
 drivers/net/tun.c   | 135 ++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 46cca1094c91..ebe0d7dc7de6 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -120,8 +121,15 @@ do {   
\
 #define TUN_VNET_LE 0x8000
 #define TUN_VNET_BE 0x4000
 
+#if IS_ENABLED(CONFIG_TUN_NAPI)
+#define TUN_FEATURES_EXTRA IFF_NAPI_FRAGS
+#else
+#define TUN_FEATURES_EXTRA 0
+#endif
+
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
- IFF_MULTI_QUEUE)
+ IFF_MULTI_QUEUE | TUN_FEATURES_EXTRA)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +181,7 @@ struct tun_file {
unsigned int ifindex;
};
struct napi_struct napi;
+   struct mutex napi_mutex;/* Protects access to the above napi */
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
@@ -276,6 +285,7 @@ static void tun_napi_init(struct tun_struct *tun, struct 
tun_file *tfile)
netif_napi_add(tun->dev, >napi, tun_napi_poll,
   NAPI_POLL_WEIGHT);
napi_enable(>napi);
+   mutex_init(>napi_mutex);
}
 }
 
@@ -291,6 +301,11 @@ static void tun_napi_del(struct tun_file *tfile)
netif_napi_del(>napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+   return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1034,7 +1049,8 @@ static void tun_poll_controller(struct net_device *dev)
 * supports polling, which enables bridge devices in virt setups to
 * still use netconsole
 * If NAPI is enabled, however, we need to schedule polling for all
-* queues.
+* queues unless we are using napi_gro_frags(), which we call in
+* process context and not in NAPI context.
 */
 
if (IS_ENABLED(CONFIG_TUN_NAPI)) {
@@ -1042,6 +1058,9 @@ static void tun_poll_controller(struct net_device *dev)
struct tun_file *tfile;
int i;
 
+   if (tun_napi_frags_enabled(tun))
+   return;
+
rcu_read_lock();
for (i = 0; i < tun->numqueues; i++) {
tfile = rcu_dereference(tun->tfiles[i]);
@@ -1264,6 +1283,64 @@ static unsigned int tun_chr_poll(struct file *file, 
poll_table *wait)
return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+   size_t len,
+   const struct iov_iter *it)
+{
+   struct sk_buff *skb;
+   size_t linear;
+   int err;
+   int i;
+
+   if (it->nr_segs > MAX_SKB_FRAGS + 1)
+   return ERR_PTR(-ENOMEM);
+
+   local_bh_disable();
+   skb = napi_get_frags(>napi);
+   local_bh_enable();
+   if (!skb)
+   return ERR_PTR(-ENOMEM);
+
+   linear = iov_iter_single_seg_count(it);
+   err = __skb_grow(skb, linear);
+   if (err)
+   goto free;
+
+   skb->len = len;
+   skb->data_len = len - linear;
+   skb->truesize += skb->data_len;
+
+   for (i = 1; i < it->nr_segs; i++) {
+   size_t fragsz = it->iov[i].iov_len;
+   unsigned long offset;
+   struct page *page;
+   void *data;
+
+   if (fragsz == 0 || fragsz > PAGE_SIZE) {
+   err = -EINVAL;
+   goto free;
+   }
+
+   local_bh_disable();
+   data = 

Re: [PATCH net-next 04/14] gtp: udp recv clean up

2017-09-19 Thread kbuild test robot
Hi Tom,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Tom-Herbert/gtp-Additional-feature-support/20170919-143920
config: i386-randconfig-x016-201738 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

Note: the 
linux-review/Tom-Herbert/gtp-Additional-feature-support/20170919-143920 HEAD 
737a09b8f9cd56706d01703d17523b0fea907f41 builds fine.
  It only hurts bisectibility.

All errors (new ones prefixed by >>):

   drivers//net/gtp.c: In function 'gtp_rx':
>> drivers//net/gtp.c:222:21: error: 'gtp' undeclared (first use in this 
>> function)
 gro_cells_receive(>gro_cells, skb);
^~~
   drivers//net/gtp.c:222:21: note: each undeclared identifier is reported only 
once for each function it appears in
   drivers//net/gtp.c: In function 'gtp_link_setup':
   drivers//net/gtp.c:628:18: error: 'gtp' undeclared (first use in this 
function)
 gro_cells_init(>gro_cells, dev);
 ^~~

vim +/gtp +222 drivers//net/gtp.c

   190  
   191  static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
   192  unsigned int hdrlen, unsigned int role)
   193  {
   194  struct pcpu_sw_netstats *stats;
   195  
   196  if (!gtp_check_ms(skb, pctx, hdrlen, role)) {
   197  netdev_dbg(pctx->dev, "No PDP ctx for this MS\n");
   198  return 1;
   199  }
   200  
   201  /* Get rid of the GTP + UDP headers. */
   202  if (iptunnel_pull_header(skb, hdrlen, skb->protocol,
   203   !net_eq(sock_net(pctx->sk), 
dev_net(pctx->dev
   204  return -1;
   205  
   206  netdev_dbg(pctx->dev, "forwarding packet from GGSN to 
uplink\n");
   207  
   208  /* Now that the UDP and the GTP header have been removed, set 
up the
   209   * new network header. This is required by the upper layer to
   210   * calculate the transport header.
   211   */
   212  skb_reset_network_header(skb);
   213  
   214  skb->dev = pctx->dev;
   215  
   216  stats = this_cpu_ptr(pctx->dev->tstats);
   217  u64_stats_update_begin(>syncp);
   218  stats->rx_packets++;
   219  stats->rx_bytes += skb->len;
   220  u64_stats_update_end(>syncp);
   221  
 > 222  gro_cells_receive(>gro_cells, skb);
   223  
   224  return 0;
   225  }
   226  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH,net-next,1/2] tun: enable NAPI for TUN/TAP driver

2017-09-19 Thread Petar Penkov
Changes TUN driver to use napi_gro_receive() upon receiving packets
rather than netif_rx_ni(). Adds flag CONFIG_TUN_NAPI that enables
these changes and operation is not affected if the flag is disabled.
SKBs are constructed upon packet arrival and are queued to be
processed later.

The new path was evaluated with a benchmark with the following setup:
Open two tap devices and a receiver thread that reads in a loop for
each device. Start one sender thread and pin all threads to different
CPUs. Send 1M minimum UDP packets to each device and measure sending
time for each of the sending methods:
napi_gro_receive(): 4.90s
netif_rx_ni():  4.90s
netif_receive_skb():7.20s

Signed-off-by: Petar Penkov 
Cc: Eric Dumazet 
Cc: Mahesh Bandewar 
Cc: Willem de Bruijn 
Cc: da...@davemloft.net
Cc: ppen...@stanford.edu
---
 drivers/net/Kconfig |   8 
 drivers/net/tun.c   | 120 +++-
 2 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index aba0d652095b..0176264b1e70 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -307,6 +307,14 @@ config TAP
  This option is selected by any driver implementing tap user space
  interface for a virtual interface to re-use core tap functionality.
 
+config TUN_NAPI
+   bool "NAPI support on tx path for TUN/TAP driver"
+   default n
+   depends on TUN
+   ---help---
+ This option allows the TUN/TAP driver to use NAPI to pass packets to
+ the kernel when receiving packets from user space via write()/send().
+
 config TUN_VNET_CROSS_LE
bool "Support for cross-endian vnet headers on little-endian kernels"
default n
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3c9985f29950..46cca1094c91 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -172,6 +172,7 @@ struct tun_file {
u16 queue_index;
unsigned int ifindex;
};
+   struct napi_struct napi;
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
@@ -229,6 +230,67 @@ struct tun_struct {
struct bpf_prog __rcu *xdp_prog;
 };
 
+static int tun_napi_receive(struct napi_struct *napi, int budget)
+{
+   struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+   struct sk_buff_head *queue = >sk.sk_write_queue;
+   struct sk_buff_head process_queue;
+   struct sk_buff *skb;
+   int received = 0;
+
+   __skb_queue_head_init(_queue);
+
+   spin_lock(>lock);
+   skb_queue_splice_tail_init(queue, _queue);
+   spin_unlock(>lock);
+
+   while (received < budget && (skb = __skb_dequeue(_queue))) {
+   napi_gro_receive(napi, skb);
+   ++received;
+   }
+
+   if (!skb_queue_empty(_queue)) {
+   spin_lock(>lock);
+   skb_queue_splice(_queue, queue);
+   spin_unlock(>lock);
+   }
+
+   return received;
+}
+
+static int tun_napi_poll(struct napi_struct *napi, int budget)
+{
+   unsigned int received;
+
+   received = tun_napi_receive(napi, budget);
+
+   if (received < budget)
+   napi_complete_done(napi, received);
+
+   return received;
+}
+
+static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile)
+{
+   if (IS_ENABLED(CONFIG_TUN_NAPI)) {
+   netif_napi_add(tun->dev, >napi, tun_napi_poll,
+  NAPI_POLL_WEIGHT);
+   napi_enable(>napi);
+   }
+}
+
+static void tun_napi_disable(struct tun_file *tfile)
+{
+   if (IS_ENABLED(CONFIG_TUN_NAPI))
+   napi_disable(>napi);
+}
+
+static void tun_napi_del(struct tun_file *tfile)
+{
+   if (IS_ENABLED(CONFIG_TUN_NAPI))
+   netif_napi_del(>napi);
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -541,6 +603,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
 
tun = rtnl_dereference(tfile->tun);
 
+   if (tun && clean) {
+   tun_napi_disable(tfile);
+   tun_napi_del(tfile);
+   }
+
if (tun && !tfile->detached) {
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
@@ -598,6 +665,7 @@ static void tun_detach_all(struct net_device *dev)
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
BUG_ON(!tfile);
+   tun_napi_disable(tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
RCU_INIT_POINTER(tfile->tun, NULL);
@@ -613,6 +681,7 @@ static void tun_detach_all(struct net_device *dev)
synchronize_net();

[PATCH net-next 1/4] qed: Add iWARP enablement support

2017-09-19 Thread Michal Kalderon
This patch is the last of the initial iWARP patch series. It
adds the possibility to actually detect iWARP from the device and enable
it in the critical locations which basically make iWARP available.

It wasn't submitted until now as iWARP hadn't been accepted into
the rdma tree.

Signed-off-by: Michal Kalderon 
Signed-off-by: Ariel Elior 
---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c |  6 ++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 10 +-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c|  5 -
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  1 +
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c 
b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index af106be..afd07ad 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -2069,6 +2069,12 @@ static void qed_rdma_set_pf_params(struct qed_hwfn 
*p_hwfn,
 
num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs);
 
+   if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) {
+   DP_NOTICE(p_hwfn,
+ "Current day drivers don't support RoCE & iWARP 
simultaneously on the same PF. Default to RoCE-only\n");
+   p_hwfn->hw_info.personality = QED_PCI_ETH_ROCE;
+   }
+
switch (p_hwfn->hw_info.personality) {
case QED_PCI_ETH_IWARP:
/* Each QP requires one connection */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c 
b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 376485d..8b99c7d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1691,12 +1691,12 @@ int qed_mcp_get_media_type(struct qed_dev *cdev, u32 
*p_media_type)
case FW_MB_PARAM_GET_PF_RDMA_ROCE:
*p_proto = QED_PCI_ETH_ROCE;
break;
-   case FW_MB_PARAM_GET_PF_RDMA_BOTH:
-   DP_NOTICE(p_hwfn,
- "Current day drivers don't support RoCE & iWARP. 
Default to RoCE-only\n");
-   *p_proto = QED_PCI_ETH_ROCE;
-   break;
case FW_MB_PARAM_GET_PF_RDMA_IWARP:
+   *p_proto = QED_PCI_ETH_IWARP;
+   break;
+   case FW_MB_PARAM_GET_PF_RDMA_BOTH:
+   *p_proto = QED_PCI_ETH_RDMA;
+   break;
default:
DP_NOTICE(p_hwfn,
  "MFW answers GET_PF_RDMA_PROTOCOL but param is 
%08x\n",
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c 
b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index 6fb9951..06715f7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -156,7 +156,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
return rc;
 
p_hwfn->p_rdma_info = p_rdma_info;
-   p_rdma_info->proto = PROTOCOLID_ROCE;
+   if (QED_IS_IWARP_PERSONALITY(p_hwfn))
+   p_rdma_info->proto = PROTOCOLID_IWARP;
+   else
+   p_rdma_info->proto = PROTOCOLID_ROCE;
 
num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto,
   NULL);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c 
b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 46d0c3c..a1d33f3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -377,6 +377,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
p_ramrod->personality = PERSONALITY_ISCSI;
break;
case QED_PCI_ETH_ROCE:
+   case QED_PCI_ETH_IWARP:
p_ramrod->personality = PERSONALITY_RDMA_AND_ETH;
break;
default:
-- 
1.8.3.1



[PATCH net-next 2/4] qed: Add iWARP out of order support

2017-09-19 Thread Michal Kalderon
iWARP requires OOO support which is already provided by the ll2
interface (until now was used only for iSCSI offload).
The changes mostly include opening a ll2 dedicated connection for
OOO and notifying the FW about the handle id.

Signed-off-by: Michal Kalderon 
Signed-off-by: Ariel Elior 
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 44 +
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 +++-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c  |  7 +++--
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c 
b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 9d989c9..568e985 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -41,6 +41,7 @@
 #include "qed_rdma.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_ooo.h"
 
 #define QED_IWARP_ORD_DEFAULT  32
 #define QED_IWARP_IRD_DEFAULT  32
@@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn *p_hwfn, 
u32 cid)
spin_unlock_bh(_hwfn->p_rdma_info->lock);
 }
 
+void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn,
+ struct iwarp_init_func_params *p_ramrod)
+{
+   p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) +
+   p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle;
+}
+
 static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid)
 {
int rc;
@@ -1876,6 +1884,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
}
 
+   if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) {
+   rc = qed_ll2_terminate_connection(p_hwfn,
+ iwarp_info->ll2_ooo_handle);
+   if (rc)
+   DP_INFO(p_hwfn, "Failed to terminate ooo connection\n");
+
+   qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
+   iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
+   }
+
qed_llh_remove_mac_filter(p_hwfn,
  p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
return rc;
@@ -1927,10 +1945,12 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
struct qed_iwarp_info *iwarp_info;
struct qed_ll2_acquire_data data;
struct qed_ll2_cbs cbs;
+   u16 n_ooo_bufs;
int rc = 0;
 
iwarp_info = _hwfn->p_rdma_info->iwarp;
iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
+   iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
 
iwarp_info->max_mtu = params->max_mtu;
 
@@ -1978,6 +1998,29 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
if (rc)
goto err;
 
+   /* Start OOO connection */
+   data.input.conn_type = QED_LL2_TYPE_OOO;
+   data.input.mtu = params->max_mtu;
+
+   n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) /
+iwarp_info->max_mtu;
+   n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE);
+
+   data.input.rx_num_desc = n_ooo_bufs;
+   data.input.rx_num_ooo_buffers = n_ooo_bufs;
+
+   data.input.tx_max_bds_per_packet = 1;   /* will never be fragmented */
+   data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE;
+   data.p_connection_handle = _info->ll2_ooo_handle;
+
+   rc = qed_ll2_acquire_connection(p_hwfn, );
+   if (rc)
+   goto err;
+
+   rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
+   if (rc)
+   goto err;
+
return rc;
 err:
qed_iwarp_ll2_stop(p_hwfn, p_ptt);
@@ -2014,6 +2057,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct 
qed_ptt *p_ptt,
 
qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP,
  qed_iwarp_async_event);
+   qed_ooo_setup(p_hwfn);
 
return qed_iwarp_ll2_start(p_hwfn, params, p_ptt);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h 
b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 148ef3c..9e2bfde 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -47,7 +47,12 @@ enum qed_iwarp_qp_state {
 #define QED_IWARP_LL2_SYN_TX_SIZE   (128)
 #define QED_IWARP_LL2_SYN_RX_SIZE   (256)
 #define QED_IWARP_MAX_SYN_PKT_SIZE  (128)
-#define QED_IWARP_HANDLE_INVAL (0xff)
+
+#define QED_IWARP_LL2_OOO_DEF_TX_SIZE   (256)
+#define QED_IWARP_MAX_OOO  (16)
+#define QED_IWARP_LL2_OOO_MAX_RX_SIZE   (16384)
+
+#define QED_IWARP_HANDLE_INVAL (0xff)
 
 struct qed_iwarp_ll2_buff {
void *data;
@@ -67,6 +72,7 @@ struct qed_iwarp_info {
u8 crc_needed;
u8 tcp_flags;
 

[PATCH net-next 0/4] qed: iWARP fixes and enhancements

2017-09-19 Thread Michal Kalderon
This patch series includes several fixes and enhancements
related to iWARP.

Patch #1 is actually the last of the initial iWARP submission.
It has been delayed until now as I wanted to make sure that qedr
supports iWARP prior to enabling iWARP device detection.

iWARP changes in RDMA tree have been accepted and targeted at
kernel 4.15, therefore, all iWARP fixes for this cycle are
submitted to net-next.

Signed-off by: michal.kalde...@cavium.com
Signed-off-by: Ariel Elior 

Michal Kalderon (4):
  qed: Add iWARP enablement support
  qed: Add iWARP out of order support
  qed: Fix maximum number of CQs for iWARP
  qed: iWARP - Add check for errors on a SYN packet

 drivers/net/ethernet/qlogic/qed/qed_cxt.c |  6 +++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c   | 52 +++
 drivers/net/ethernet/qlogic/qed/qed_iwarp.h   | 11 -
 drivers/net/ethernet/qlogic/qed/qed_ll2.c |  1 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 10 ++---
 drivers/net/ethernet/qlogic/qed/qed_rdma.c| 24 +++
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  1 +
 include/linux/qed/qed_ll2_if.h|  1 +
 8 files changed, 91 insertions(+), 15 deletions(-)

-- 
1.8.3.1



[PATCH net-next 4/4] qed: iWARP - Add check for errors on a SYN packet

2017-09-19 Thread Michal Kalderon
A SYN packet which arrives with errors from FW should be dropped.
This required adding an additional field to the ll2
rx completion data.

Signed-off-by: Michal Kalderon 
Signed-off-by: Ariel Elior 
---
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 
 drivers/net/ethernet/qlogic/qed/qed_ll2.c   | 1 +
 include/linux/qed/qed_ll2_if.h  | 1 +
 3 files changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c 
b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 568e985..8fc9c811 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1733,6 +1733,14 @@ int qed_iwarp_reject(void *rdma_cxt, struct 
qed_iwarp_reject_in *iparams)
 
memset(_info, 0, sizeof(cm_info));
ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle;
+
+   /* Check if packet was received with errors... */
+   if (data->err_flags) {
+   DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n",
+ data->err_flags);
+   goto err;
+   }
+
if (GET_FIELD(data->parse_flags,
  PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) &&
GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c 
b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index c06ad4f..250afa5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
  struct qed_ll2_comp_rx_data *data)
 {
data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags);
+   data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags);
data->length.packet_length =
le16_to_cpu(p_cqe->rx_cqe_fp.packet_length);
data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan);
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index dd7a3b8..89fa0bb 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data {
void *cookie;
dma_addr_t rx_buf_addr;
u16 parse_flags;
+   u16 err_flags;
u16 vlan;
bool b_last_packet;
u8 connection_handle;
-- 
1.8.3.1



[PATCH] net: compat: assert the size of cmsg copied in is as expected

2017-09-19 Thread Meng Xu
The actual length of cmsg fetched in during the second loop
(i.e., kcmsg - kcmsg_base) could be different from what we
get from the first loop (i.e., kcmlen).

The main reason is that the two get_user() calls in the two
loops (i.e., get_user(ucmlen, >cmsg_len) and
__get_user(ucmlen, >cmsg_len)) could cause ucmlen
to have different values even they fetch from the same userspace
address, as user can race to change the memory content in
>cmsg_len across fetches.

Although in the second loop, the sanity check
if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
is in place, it only ensures that the cmsg fetched in during the
second loop does not exceed the length of kcmlen, but not
necessarily equal to kcmlen. But indicated by the assignment
kmsg->msg_controllen = kcmlen, we should enforce that.

This patch adds this additional sanity check and ensures that
what is recorded in kmsg->msg_controllen is the actual cmsg length.

Signed-off-by: Meng Xu 
---
 net/compat.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/compat.c b/net/compat.c
index 6ded6c8..2238171 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, 
struct sock *sk,
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
 
+   /*
+* check the length of messages copied in is the same as the
+* what we get from the first loop
+*/
+   if ((char *)kcmsg - (char *)kcmsg_base != kcmlen)
+   goto Einval;
+
/* Ok, looks like we made it.  Hook it up and return success. */
kmsg->msg_control = kcmsg_base;
kmsg->msg_controllen = kcmlen;
-- 
2.7.4



[PATCH] net: emac: Fix napi poll list corruption

2017-09-19 Thread Christian Lamparter
This patch is pretty much a carbon copy of
commit 3079c652141f ("caif: Fix napi poll list corruption")
with "caif" replaced by "emac".

The commit d75b1ade567f ("net: less interrupt masking in NAPI")
breaks emac.

It is now required that if the entire budget is consumed when poll
returns, the napi poll_list must remain empty.  However, like some
other drivers emac tries to do a last-ditch check and if there is
more work it will call napi_reschedule and then immediately process
some of this new work.  Should the entire budget be consumed while
processing such new work then we will violate the new caller
contract.

This patch fixes this by not touching any work when we reschedule
in emac.

Signed-off-by: Christian Lamparter 
---
 drivers/net/ethernet/ibm/emac/mal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/mal.c 
b/drivers/net/ethernet/ibm/emac/mal.c
index 2c74baa2398a..fff09dcf9e34 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget)
unsigned long flags;
 
MAL_DBG2(mal, "poll(%d)" NL, budget);
- again:
+
/* Process TX skbs */
list_for_each(l, >poll_list) {
struct mal_commac *mc =
@@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget)
spin_lock_irqsave(>lock, flags);
mal_disable_eob_irq(mal);
spin_unlock_irqrestore(>lock, flags);
-   goto again;
}
mc->ops->poll_tx(mc->dev);
}
-- 
2.14.1



Re: [PATCH net] MAINTAINERS: Remove Yuval Mintz from maintainers list

2017-09-19 Thread David Miller
From: 
Date: Tue, 19 Sep 2017 12:54:34 +0300

> From: Ariel Elior 
> 
> Remove Yuval from maintaining the bnx2x & qed* modules as he is no longer
> working for the company. Thanks Yuval for your huge contributions and
> tireless efforts over the many years and various companies.
> 
> Ariel
> Signed-off-by: Ariel Elior 

Applied, thanks.


Re: [PATCH net-next v3 1/4] bpf: add helper bpf_perf_event_read_value for perf event array map

2017-09-19 Thread kbuild test robot
Hi Yonghong,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Yonghong-Song/bpf-add-two-helpers-to-read-perf-event-enabled-running-time/20170919-134113
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   kernel/bpf/arraymap.c: In function 'perf_event_fd_array_get_ptr':
>> kernel/bpf/arraymap.c:495:6: error: too many arguments to function 
>> 'perf_event_read_local'
 if (perf_event_read_local(event, , NULL, NULL) == -EOPNOTSUPP)
 ^
   In file included from kernel/bpf/arraymap.c:18:0:
   include/linux/perf_event.h:1290:19: note: declared here
static inline int perf_event_read_local(struct perf_event *event, u64 
*value)
  ^

vim +/perf_event_read_local +495 kernel/bpf/arraymap.c

   480  
   481  static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
   482   struct file *map_file, int fd)
   483  {
   484  struct bpf_event_entry *ee;
   485  struct perf_event *event;
   486  struct file *perf_file;
   487  u64 value;
   488  
   489  perf_file = perf_event_get(fd);
   490  if (IS_ERR(perf_file))
   491  return perf_file;
   492  
   493  ee = ERR_PTR(-EOPNOTSUPP);
   494  event = perf_file->private_data;
 > 495  if (perf_event_read_local(event, , NULL, NULL) == 
 > -EOPNOTSUPP)
   496  goto err_out;
   497  
   498  ee = bpf_event_entry_gen(perf_file, map_file);
   499  if (ee)
   500  return ee;
   501  ee = ERR_PTR(-ENOMEM);
   502  err_out:
   503  fput(perf_file);
   504  return ee;
   505  }
   506  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: Page allocator bottleneck

2017-09-19 Thread Aaron Lu
On Mon, Sep 18, 2017 at 06:33:20PM +0300, Tariq Toukan wrote:
> 
> 
> On 18/09/2017 10:44 AM, Aaron Lu wrote:
> > On Mon, Sep 18, 2017 at 03:34:47PM +0800, Aaron Lu wrote:
> > > On Sun, Sep 17, 2017 at 07:16:15PM +0300, Tariq Toukan wrote:
> > > > 
> > > > It's nice to have the option to dynamically play with the parameter.
> > > > But maybe we should also think of changing the default fraction 
> > > > guaranteed
> > > > to the PCP, so that unaware admins of networking servers would also 
> > > > benefit.
> > > 
> > > I collected some performance data with will-it-scale/page_fault1 process
> > > mode on different machines with different pcp->batch sizes, starting
> > > from the default 31(calculated by zone_batchsize(), 31 is the standard
> > > value for any zone that has more than 1/2MiB memory), then incremented
> > > by 31 upwards till 527. PCP's upper limit is 6*batch.
> > > 
> > > An image is plotted and attached: batch_full.png(full here means the
> > > number of process started equals to CPU number).
> > 
> > To be clear: X-axis is the value of batch size(31, 62, 93, ..., 527),
> > Y-axis is the value of per_process_ops, generated by will-it-scale,

One correction here, Y-axis isn't per_process_ops but per_process_ops *
nr_processes. Still, higher is better.

> > higher is better.
> > 
> > > 
> > >  From the image:
> > > - For EX machines, they all see throughput increase with increased batch
> > >size and peaked at around batch_size=310, then fall;
> > > - For EP machines, Haswell-EP and Broadwell-EP also see throughput
> > >increase with increased batch size and peaked at batch_size=279, then
> > >fall, batch_size=310 also delivers pretty good result. Skylake-EP is
> > >quite different in that it doesn't see any obvious throughput increase
> > >after batch_size=93, though the trend is still increasing, but in a 
> > > very
> > >small way and finally peaked at batch_size=403, then fall.
> > >Ivybridge EP behaves much like desktop ones.
> > > - For Desktop machines, they do not see any obvious changes with
> > >increased batch_size.
> > > 
> > > So the default batch size(31) doesn't deliver good enough result, we
> > > probbaly should change the default value.
> 
> Thanks Aaron for sharing your experiment results.
> That's a good analysis of the effect of the batch value.
> I agree with your conclusion.
> 
> From networking perspective, we should reconsider the defaults to be able to
> reach the increasing NICs linerates.
> Not only for pcp->batch, but also for pcp->high.

I guess I didn't make it clear in my last email: when pcp->batch is
changed, pcp->high is also changed. Their relationship is:
pcp->high = pcp->batch * 6.

Manipulating percpu_pagelist_fraction could increase pcp->high, but not
pcp->batch(it has an upper limit as 96 currently).

My test shows even when pcp->high being the same, changing pcp->batch
could further improve will-it-scale's performance. e.g. in the below two
cases, pcp->high are both set to 1860 but with different pcp->batch:

 will-it-scalenative_queued_spin_lock_slowpath(perf)
pcp->batch=9615762348 79.95%
pcp->batch=310   19291492 +22.3%  74.87% -5.1%

Granted, this is the case for will-it-scale and may not apply to your
case. I have a small patch that adds a batch interface for debug
purpose, echo a value could set batch and high will be batch * 6. You
are welcome to give it a try if you think it's worth(attached).

Regards,
Aaron
>From e3c9516beb8302cb8fb2f5ab866bbe2686fda5fb Mon Sep 17 00:00:00 2001
From: Aaron Lu 
Date: Thu, 6 Jul 2017 15:00:07 +0800
Subject: [PATCH] percpu_pagelist_batch: add a batch interface

Signed-off-by: Aaron Lu 
---
 include/linux/mmzone.h |  2 ++
 kernel/sysctl.c|  9 +
 mm/page_alloc.c| 40 +++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ef6a13b7bd3e..0548d038b7cd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -875,6 +875,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, 
int,
void __user *, size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+int percpu_pagelist_batch_sysctl_handler(struct ctl_table *, int,
+   void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..85cc4544db1b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -108,6 +108,7 @@ extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int pid_max_min, 

Re: [PATCH net] bpf: fix ri->map prog pointer on bpf_prog_realloc

2017-09-19 Thread Daniel Borkmann

On 09/19/2017 03:43 AM, Alexei Starovoitov wrote:

On Tue, Sep 19, 2017 at 03:16:44AM +0200, Daniel Borkmann wrote:

Commit 109980b894e9 ("bpf: don't select potentially stale
ri->map from buggy xdp progs") passed the pointer to the prog
itself to be loaded into r4 prior on bpf_redirect_map() helper
call, so that we can store the owner into ri->map_owner out of
the helper.

Issue with that is that the actual address of the prog is still
subject to change when subsequent rewrites occur, e.g. through
patching other helper functions or constant blinding. Thus, we
really need to take prog->aux as the address we're holding, and
then during runtime fetch the actual pointer via aux->prog. This
also works with prog clones as they share the same aux and fixup
pointer to self after blinding finished.

Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp 
progs")
Signed-off-by: Daniel Borkmann 
---
  kernel/bpf/verifier.c | 12 ++--
  1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b245..243c09f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4205,9 +4205,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
}

if (insn->imm == BPF_FUNC_redirect_map) {
-   u64 addr = (unsigned long)prog;
+   /* Note, we cannot use prog directly as imm as 
subsequent
+* rewrites would still change the prog pointer. The 
only
+* stable address we can use is aux, which also works 
with
+* prog clones during blinding.
+*/


good catch. extra load at runtime sucks, but I don't see better solution.


+   u64 addr = (unsigned long)prog->aux;
+   const int r4 = BPF_REG_4;
struct bpf_insn r4_ld[] = {
-   BPF_LD_IMM64(BPF_REG_4, addr),
+   BPF_LD_IMM64(r4, addr),
+   BPF_LDX_MEM(BPF_DW, r4, r4,
+   offsetof(struct bpf_prog_aux, 
prog)),


needs to be BPF_FIELD_SIZEOF(struct bpf_prog_aux, prog) to work on 32-bit


Good point, will spin a v2. Thanks!


Re: [PATCH v2] net: stmmac: dwmac-sun8i: Use reset exclusive

2017-09-19 Thread Maxime Ripard
On Mon, Sep 18, 2017 at 08:30:43PM +0200, Corentin Labbe wrote:
> The current dwmac_sun8i module cannot be rmmod/modprobe due to that
> the reset controller was not released when removed.
> 
> This patch remove ambiguity, by using of_reset_control_get_exclusive and
> add the missing reset_control_put().
> 
> Note that we cannot use devm_reset_control_get, since the reset is not
> in the device node.
> 
> Signed-off-by: Corentin Labbe 
> ---
> Changes since v1:
> - added a note about devm_reset_control_get in commit message

That comment would be better if it was in the code.

> 
>  drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c 
> b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> index 57bb6dd7b401..1736d7cb0d96 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> @@ -854,6 +854,7 @@ static int sun8i_dwmac_unpower_internal_phy(struct 
> sunxi_priv_data *gmac)
>  
>   clk_disable_unprepare(gmac->ephy_clk);
>   reset_control_assert(gmac->rst_ephy);
> + reset_control_put(gmac->rst_ephy);

Putting it here is weird.

What would happen if power_phy / unpower_phy is called several times?

Can't we just make it symmetric and undo in remove what we do in probe?

Maxime

-- 
Maxime Ripard, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com


signature.asc
Description: PGP signature


Re: [PATCH RFC V1 net-next 0/6] Time based packet transmission

2017-09-19 Thread Richard Cochran
On Tue, Sep 19, 2017 at 04:43:02PM +0200, Miroslav Lichvar wrote:
> If I understand it correctly, this also allows us to make a PTP/NTP
> "one-step" clock with HW that doesn't support it directly.

Cool, yeah, I hadn't thought of that, but it would work...

Thanks,
Richard


[PATCH net] tcp: fastopen: fix on syn-data transmit failure

2017-09-19 Thread Eric Dumazet
From: Eric Dumazet 

Our recent change exposed a bug in TCP Fastopen Client that syzkaller
found right away [1]

When we prepare skb with SYN+DATA, we attempt to transmit it,
and we update socket state as if the transmit was a success.

In socket RTX queue we have two skbs, one with the SYN alone,
and a second one containing the DATA.

When (malicious) ACK comes in, we now complain that second one had no
skb_mstamp.

The proper fix is to make sure that if the transmit failed, we do not
pretend we sent the DATA skb, and make it our send_head.

When 3WHS completes, we can now send the DATA right away, without having
to wait for a timeout.

[1]
WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 
tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117()

 WARN_ON_ONCE(last_ackt == 0);

Modules linked in:
CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
  8800b35cb1d8 81cad00d 
 828a4347 88009f86c080 8316eb20 0d7f
 8800b35cb220 812c33c2 8800baad2440 0009d46575c0
Call Trace:
 [] __dump_stack 
 [] dump_stack+0xc1/0x124 
 [] warn_slowpath_common+0xe2/0x150 
 [] warn_slowpath_null+0x2e/0x40 
 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n
 [] tcp_ack+0x151d/0x3930 
 [] tcp_rcv_state_process+0x1c69/0x4fd0 
 [] tcp_v4_do_rcv+0x54f/0x7c0 
 [] sk_backlog_rcv 
 [] __release_sock+0x12b/0x3a0 
 [] release_sock+0x5e/0x1c0 
 [] inet_wait_for_connect 
 [] __inet_stream_connect+0x545/0xc50 
 [] tcp_sendmsg_fastopen 
 [] tcp_sendmsg+0x2298/0x35a0 
 [] inet_sendmsg+0xe5/0x520 
 [] sock_sendmsg_nosec 
 [] sock_sendmsg+0xcf/0x110 

Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully")
Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data")
Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov 
Cc: Neal Cardwell 
Cc: Yuchung Cheng 
---
 net/ipv4/tcp_output.c |9 +
 1 file changed, 9 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 
517d737059d18d8821b65dcdf54d9bb3448784c2..0bc9e46a53696578eb6e911f2f75e6b34c80894f
 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct 
sk_buff *syn)
goto done;
}
 
+   /* data was not sent, this is our new send_head */
+   sk->sk_send_head = syn_data;
+   tp->packets_out -= tcp_skb_pcount(syn_data);
+
 fallback:
/* Send a regular SYN with Fast Open cookie request option */
if (fo->cookie.len > 0)
@@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk)
 */
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
+   buff = tcp_send_head(sk);
+   if (unlikely(buff)) {
+   tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+   tp->pushed_seq  = TCP_SKB_CB(buff)->seq;
+   }
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
 
/* Timer for repeating the SYN until an answer. */




[PATCH net-next 3/4] qed: Fix maximum number of CQs for iWARP

2017-09-19 Thread Michal Kalderon
The maximum number of CQs supported is bound to the number
of connections supported, which differs between RoCE and iWARP.

This fixes a crash that occurred in iWARP when running 1000 sessions
using perftest.

Signed-off-by: Michal Kalderon 
Signed-off-by: Ariel Elior 
---
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c 
b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index 4f46f28..c8c4b39 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -209,11 +209,11 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
goto free_pd_map;
}
 
-   /* Allocate bitmap for cq's. The maximum number of CQs is bounded to
-* twice the number of QPs.
+   /* Allocate bitmap for cq's. The maximum number of CQs is bound to
+* the number of connections we support. (num_qps in iWARP or
+* num_qps/2 in RoCE).
 */
-   rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->cq_map,
-p_rdma_info->num_qps * 2, "CQ");
+   rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->cq_map, num_cons, "CQ");
if (rc) {
DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
   "Failed to allocate cq bitmap, rc = %d\n", rc);
@@ -222,10 +222,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
 
/* Allocate bitmap for toggle bit for cq icids
 * We toggle the bit every time we create or resize cq for a given icid.
-* The maximum number of CQs is bounded to  twice the number of QPs.
+* Size needs to equal the size of the cq bmap.
 */
rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->toggle_bits,
-p_rdma_info->num_qps * 2, "Toggle");
+num_cons, "Toggle");
if (rc) {
DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
   "Failed to allocate toogle bits, rc = %d\n", rc);
-- 
1.8.3.1



Re: [PATCH net] bpf: do not disable/enable BH in bpf_map_free_id()

2017-09-19 Thread Martin KaFai Lau
On Tue, Sep 19, 2017 at 04:15:59PM +, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> syzkaller reported following splat [1]
> 
> Since hard irq are disabled by the caller, bpf_map_free_id()
> should not try to enable/disable BH.
> 
> Another solution would be to change htab_map_delete_elem() to
> defer the free_htab_elem() call after
> raw_spin_unlock_irqrestore(>lock, flags), but this might be not
> enough to cover other code paths.
Thanks for fixing it.

Acked-by: Martin KaFai Lau 


Re: [PATCH] VSOCK: fix uapi/linux/vm_sockets.h incomplete types

2017-09-19 Thread David Miller
From: Stefan Hajnoczi 
Date: Mon, 18 Sep 2017 16:21:00 +0100

> On Fri, Sep 15, 2017 at 02:14:32PM -0700, David Miller wrote:
>> > diff --git a/include/uapi/linux/vm_sockets.h 
>> > b/include/uapi/linux/vm_sockets.h
>> > index b4ed5d895699..4ae5c625ac56 100644
>> > --- a/include/uapi/linux/vm_sockets.h
>> > +++ b/include/uapi/linux/vm_sockets.h
>> > @@ -18,6 +18,10 @@
>> >  
>> >  #include 
>> >  
>> > +#ifndef __KERNEL__
>> > +#include  /* struct sockaddr */
>> > +#endif
>> > +
>> 
>> There is no precedent whatsoever to include sys/socket.h in _any_ UAPI
>> header file provided by the kernel.
> 
>  does it for the same reason:
> 
> include/uapi/linux/if.h:#include  /* for struct 
> sockaddr. */

You don't need it for struct sockaddr, you need it for sa_family_t,
the comment is very misleading.

Please do as I have instructed and it will fix this problem.

Thank you.



[PATCH net-next 4/4] net: dsa: move master ethtool code

2017-09-19 Thread Vivien Didelot
DSA overrides the master device ethtool ops, so that it can inject stats
from its dedicated switch CPU port as well.

The related code is currently split in dsa.c and slave.c, but it only
scopes the master net device. Move it to a new master.c DSA core file.

This file will be later extended with master net device specific code.

Signed-off-by: Vivien Didelot 
---
 net/dsa/Makefile   |   2 +-
 net/dsa/dsa.c  |  28 -
 net/dsa/dsa2.c |   4 +-
 net/dsa/dsa_priv.h |   7 ++--
 net/dsa/legacy.c   |   4 +-
 net/dsa/master.c   | 120 +
 net/dsa/slave.c|  83 
 7 files changed, 129 insertions(+), 119 deletions(-)
 create mode 100644 net/dsa/master.c

diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index fcce25da937c..2e7ac8bab19d 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o
+dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index abadf7b49236..81c852e32821 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int 
tag_protocol)
return ops;
 }
 
-int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp)
-{
-   struct dsa_switch *ds = cpu_dp->ds;
-   struct net_device *master;
-   struct ethtool_ops *cpu_ops;
-
-   master = cpu_dp->netdev;
-
-   cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL);
-   if (!cpu_ops)
-   return -ENOMEM;
-
-   cpu_dp->orig_ethtool_ops = master->ethtool_ops;
-   if (cpu_dp->orig_ethtool_ops)
-   memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops));
-
-   dsa_cpu_port_ethtool_init(cpu_ops);
-   master->ethtool_ops = cpu_ops;
-
-   return 0;
-}
-
-void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp)
-{
-   cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops;
-   cpu_dp->orig_ethtool_ops = NULL;
-}
-
 void dsa_cpu_dsa_destroy(struct dsa_port *port)
 {
struct device_node *port_dn = port->dn;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 032f8bc3e788..dcccaebde708 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -440,7 +440,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
wmb();
dst->cpu_dp->netdev->dsa_ptr = dst;
 
-   err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
+   err = dsa_master_ethtool_setup(dst->cpu_dp->netdev);
if (err)
return err;
 
@@ -457,7 +457,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
if (!dst->applied)
return;
 
-   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
+   dsa_master_ethtool_restore(dst->cpu_dp->netdev);
 
dst->cpu_dp->netdev->dsa_ptr = NULL;
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 9c3eeb72462d..f616b318 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -97,8 +97,6 @@ struct dsa_slave_priv {
 int dsa_cpu_dsa_setup(struct dsa_port *port);
 void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
-int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp);
-void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp);
 bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
@@ -112,6 +110,10 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr 
*tb[],
   struct net_device *dev,
   const unsigned char *addr, u16 vid);
 
+/* master.c */
+int dsa_master_ethtool_setup(struct net_device *dev);
+void dsa_master_ethtool_restore(struct net_device *dev);
+
 /* port.c */
 int dsa_port_set_state(struct dsa_port *dp, u8 state,
   struct switchdev_trans *trans);
@@ -139,7 +141,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops);
 int dsa_slave_create(struct dsa_port *port, const char *name);
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 163910699db7..ae505d8e4417 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -602,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
wmb();
dev->dsa_ptr = dst;
 
-   return dsa_cpu_port_ethtool_setup(dst->cpu_dp);
+   return dsa_master_ethtool_setup(dst->cpu_dp->netdev);
 }
 
 static int dsa_probe(struct platform_device *pdev)
@@ -667,7 +667,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 {

[PATCH net-next 3/4] net: dsa: setup master ethtool after dsa_ptr

2017-09-19 Thread Vivien Didelot
DSA overrides the master's ethtool ops so that we can inject its CPU
port's statistics. Because of that, we need to setup the ethtool ops
after the master's dsa_ptr pointer has been assigned, not before.

This patch setups the ethtool ops after dsa_ptr is assigned, and
restores them before it gets cleared.

Signed-off-by: Vivien Didelot 
---
 net/dsa/dsa2.c   | 12 +++-
 net/dsa/legacy.c | 10 +++---
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index bd19304f862f..032f8bc3e788 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -433,16 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
return err;
}
 
-   err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
-   if (err)
-   return err;
-
/* If we use a tagging format that doesn't have an ethertype
 * field, make sure that all packets from this point on get
 * sent to the tag format's receive function.
 */
wmb();
dst->cpu_dp->netdev->dsa_ptr = dst;
+
+   err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
+   if (err)
+   return err;
+
dst->applied = true;
 
return 0;
@@ -456,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
if (!dst->applied)
return;
 
+   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
+
dst->cpu_dp->netdev->dsa_ptr = NULL;
 
/* If we used a tagging format that doesn't have an ethertype
@@ -472,7 +475,6 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
dsa_ds_unapply(dst, ds);
}
 
-   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
dst->cpu_dp = NULL;
 
pr_info("DSA: tree %d unapplied\n", dst->tree);
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 91e6f7981d39..163910699db7 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -206,10 +206,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds,
netdev_err(master, "[%d] : can't configure CPU and DSA ports\n",
   index);
 
-   ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp);
-   if (ret)
-   return ret;
-
return 0;
 }
 
@@ -606,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
wmb();
dev->dsa_ptr = dst;
 
-   return 0;
+   return dsa_cpu_port_ethtool_setup(dst->cpu_dp);
 }
 
 static int dsa_probe(struct platform_device *pdev)
@@ -671,6 +667,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 {
int i;
 
+   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
+
dst->cpu_dp->netdev->dsa_ptr = NULL;
 
/* If we used a tagging format that doesn't have an ethertype
@@ -686,8 +684,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
dsa_switch_destroy(ds);
}
 
-   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
-
dev_put(dst->cpu_dp->netdev);
 }
 
-- 
2.14.1



[PATCH net-next 2/4] net: dsa: setup master ethtool unconditionally

2017-09-19 Thread Vivien Didelot
When a DSA switch tree is meant to be applied, it already has a CPU
port. Thus remove the condition of dst->cpu_dp.

Moreover, the next lines access dst->cpu_dp unconditionally.

Signed-off-by: Vivien Didelot 
---
 net/dsa/dsa2.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 873af0108e24..bd19304f862f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -433,11 +433,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
return err;
}
 
-   if (dst->cpu_dp) {
-   err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
-   if (err)
-   return err;
-   }
+   err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
+   if (err)
+   return err;
 
/* If we use a tagging format that doesn't have an ethertype
 * field, make sure that all packets from this point on get
@@ -474,10 +472,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
dsa_ds_unapply(dst, ds);
}
 
-   if (dst->cpu_dp) {
-   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
-   dst->cpu_dp = NULL;
-   }
+   dsa_cpu_port_ethtool_restore(dst->cpu_dp);
+   dst->cpu_dp = NULL;
 
pr_info("DSA: tree %d unapplied\n", dst->tree);
dst->applied = false;
-- 
2.14.1



[PATCH net-next 0/4] net: dsa: move master ethtool code

2017-09-19 Thread Vivien Didelot
The DSA core overrides the master device's ethtool_ops structure so that
it can inject statistics and such of its dedicated switch CPU port.

This ethtool code is currently called on unnecessary conditions or
before the master interface and its switch CPU port get wired up.
This patchset fixes this.

Similarly to slave.c where the DSA slave net_device is the entry point
of the dsa_slave_* functions, this patchset also isolates the master's
ethtool code in a new master.c file, where the DSA master net_device is
the entry point of the dsa_master_* functions.

This is a first step towards better control of the master device and
support for multiple CPU ports.

Vivien Didelot (4):
  net: dsa: remove copy of master ethtool_ops
  net: dsa: setup master ethtool unconditionally
  net: dsa: setup master ethtool after dsa_ptr
  net: dsa: move master ethtool code

 include/net/dsa.h  |   1 -
 net/dsa/Makefile   |   2 +-
 net/dsa/dsa.c  |  28 -
 net/dsa/dsa2.c |  18 
 net/dsa/dsa_priv.h |   7 ++--
 net/dsa/legacy.c   |  10 ++---
 net/dsa/master.c   | 120 +
 net/dsa/slave.c|  80 ---
 8 files changed, 136 insertions(+), 130 deletions(-)
 create mode 100644 net/dsa/master.c

-- 
2.14.1



Re: Re: [PATCH] net/packet: fix race condition between fanout_add and __unregister_prot_hook

2017-09-19 Thread Willem de Bruijn
On Tue, Sep 19, 2017 at 3:21 AM, Nixiaoming  wrote:
> On Fri, Sep 15, 2017 at 10:46 AM, Willem de Bruijn
>
>  wrote:
>
>>
>
>> In case of failure we also need to unlink and free match. I
>
>> sent the following:
>
>>
>
>> http://patchwork.ozlabs.org/patch/813945/
>
>
>
> +   spin_lock(>bind_lock);
>
> +   if (po->running &&
>
> +   match->type == type &&
>
>match->prot_hook.type == po->prot_hook.type &&
>
>match->prot_hook.dev == po->prot_hook.dev) {
>
> err = -ENOSPC;
>
> @@ -1761,6 +1760,13 @@  static int fanout_add(struct sock *sk, u16 id, u16
> type_flags)
>
>   err = 0;
>
> }
>
>}
>
> +   spin_unlock(>bind_lock);
>
> +
>
> +   if (err && !refcount_read(>sk_ref)) {
>
> +list_del(>list);
>
> +kfree(match);
>
> +   }
>
>
>
>
>
> In the function fanout_add add spin_lock to protect po-> running and po->
> fanout,
>
> then whether it should be in the function fanout_release also add spin_lock
> protection ?

po->bind_lock is held when registering and unregistering the
protocol hook. fanout_release does not access po->running or
prot_hook.

It is called from packet_release, which does hold the bind_lock
when unregistering the protocol hook.


RE: [PATCH net-next 05/12] net: dsa: b53: Use a macro to define I/O operations

2017-09-19 Thread David Laight
> >>> +#define b53_build_op(type, op_size, val_type)\
> >>> +static inline int b53_##type##op_size(struct b53_device *dev, u8
> >page,\
> >>> +   u8 reg, val_type val) 
> >>> \
> >>> +{
> >>> \
> >>> + int ret;
> >>> \
> >>> + 
> >>> \
> >>> + mutex_lock(>reg_mutex);
> >>> \
> >>> + ret = dev->ops->type##op_size(dev, page, reg, val); 
> >>> \
> >>> + mutex_unlock(>reg_mutex);  
> >>> \
> >>> + 
> >>> \
> >>> + return ret; 
> >>> \
> >>>  }
> >>
> >> Why separate the 'type' and 'op_size' arguments since they
> >> are always pasted together?
> >
> >For read/write48, the value type is u64.
> 
> The way I read David's comment is that instead of calling the macro with 
> read, 48, just combine that
> in a single argument: read48. I don't have a preference about that and can 
> respin eventually.

Indeed, factoring in the type is harder because reads want 'u64 *' not 'u64'.
While that could be factored, it would take more source lines and make
things very obfuscated.

David



[RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"

2017-09-19 Thread Douglas Anderson
Every once in a while when my system is under a bit of stress I see
some spammy messages show up in my logs that say:

  kevent X may have been dropped

As far as I can tell these messages aren't terribly useful.  The
comments around the messages make me think that either workqueues used
to work differently or that the original author of the code missed a
sublety related to them.  The error message appears to predate the git
conversion of the kernel so it's somewhat hard to tell.

Specifically, workqueues should work like this:

A) If a workqueue hasn't been scheduled then schedule_work() schedules
   it and returns true.

B) If a workqueue has been scheduled (but hasn't started) then
   schedule_work() will do nothing and return false.

C) If a workqueue has been scheduled (and has started) then
   schedule_work() will put it on the queue to run again and return
   true.

Said another way: if you call schedule_work() you can guarantee that
at least one full runthrough of the work will happen again.  That
should mean that the work will get processed and I don't see any
reason to think something should be dropped.

Reading the comments in in usbnet_defer_kevent() made me think that B)
and C) would be treated the same.  That is: even if we've started the
work and are 99% of the way through then schedule_work() would return
false and the work wouldn't be queued again.  If schedule_work()
really did behave that way then, truly, some amount of work would be
lost.  ...but it doesn't.

NOTE: if somehow these warnings are useful to mean something then
perhaps we should change them to make it more obvious.  If it's
interesting to know when the work is backlogged then we should change
the spam to say "warning: usbnet is backlogged".

ALSO NOTE: If somehow some of the types of work need to be repeated if
usbnet_defer_kevent() is called multiple times then that should be
quite easy to accomplish without dropping any work on the floor.  We
can just keep an atomic count for that type of work and add a loop
into usbnet_deferred_kevent().

Signed-off-by: Douglas Anderson 
---

 drivers/net/usb/usbnet.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 6510e5cc1817..a3e8dbaadcf9 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -450,19 +450,17 @@ static enum skb_state defer_bh(struct usbnet *dev, struct 
sk_buff *skb,
 }
 
 /* some work can't be done in tasklets, so we use keventd
- *
- * NOTE:  annoying asymmetry:  if it's active, schedule_work() fails,
- * but tasklet_schedule() doesn't.  hope the failure is rare.
  */
 void usbnet_defer_kevent (struct usbnet *dev, int work)
 {
set_bit (work, >flags);
-   if (!schedule_work (>kevent)) {
-   if (net_ratelimit())
-   netdev_err(dev->net, "kevent %d may have been 
dropped\n", work);
-   } else {
-   netdev_dbg(dev->net, "kevent %d scheduled\n", work);
-   }
+
+   /* If work is already started this will mark it to run again when it
+* finishes; if we already had work pending and it hadn't started
+* yet then that's fine too.
+*/
+   schedule_work (>kevent);
+   netdev_dbg(dev->net, "kevent %d scheduled\n", work);
 }
 EXPORT_SYMBOL_GPL(usbnet_defer_kevent);
 
-- 
2.14.1.690.gbb1197296e-goog



[PATCH V2 net 4/7] net: hns3: Fixes the initialization of MAC address in hardware

2017-09-19 Thread Salil Mehta
From: Lipeng 

This patch fixes the initialization of MAC address, fetched from HNS3
firmware i.e. when it is not randomly generated, to the HNS3 hardware.

Fixes: ca60906d2795 ("net: hns3: Add support of HNS3 Ethernet Driver for
hip08 SoC")
Signed-off-by: Lipeng 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
index 1c3e294..4d68d6e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
@@ -2705,10 +2705,11 @@ static void hns3_init_mac_addr(struct net_device 
*netdev)
eth_hw_addr_random(netdev);
dev_warn(priv->dev, "using random MAC address %pM\n",
 netdev->dev_addr);
-   /* Also copy this new MAC address into hdev */
-   if (h->ae_algo->ops->set_mac_addr)
-   h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr);
}
+
+   if (h->ae_algo->ops->set_mac_addr)
+   h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr);
+
 }
 
 static void hns3_nic_set_priv_ops(struct net_device *netdev)
-- 
2.7.4




[PATCH V2 net 6/7] net: hns3: Fixes the default VLAN-id of PF

2017-09-19 Thread Salil Mehta
From: Lipeng 

When there is no vlan id in the packets, hardware will treat the vlan id
as 0 and look for the mac_vlan table. This patch sets the default vlan id
of PF to 0. Without this config, it will fail when looking up the mac_vlan
table, and hardware will drop packets.

Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine &
Compatibility Layer Support")
Signed-off-by: Mingguang Qu 
Signed-off-by: Lipeng 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 8e172af..74008ef 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3673,6 +3673,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 {
 #define HCLGE_VLAN_TYPE_VF_TABLE   0
 #define HCLGE_VLAN_TYPE_PORT_TABLE 1
+   struct hnae3_handle *handle;
int ret;
 
ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_VF_TABLE,
@@ -3682,8 +3683,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 
ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_PORT_TABLE,
 true);
+   if (ret)
+   return ret;
 
-   return ret;
+   handle = >vport[0].nic;
+   return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false);
 }
 
 static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
-- 
2.7.4




[PATCH V2 net 0/7] Bug fixes for the HNS3 Ethernet Driver for Hip08 SoC

2017-09-19 Thread Salil Mehta
This patch set presents some bug fixes for the HNS3 Ethernet driver identified
during internal testing & stabilization efforts.

Change Log:
Patch V2: Resolved comments from Leon Romanovsky
Patch V1: Initial Submit

Lipeng (6):
  net: hns3: Fixes initialization of phy address from firmware
  net: hns3: Fixes the command used to unmap ring from vector
  net: hns3: Fixes ring-to-vector map-and-unmap command
  net: hns3: Fixes the initialization of MAC address in hardware
  net: hns3: Fixes the default VLAN-id of PF
  net: hns3: Fixes the premature exit of loop when matching clients

Salil Mehta (1):
  net: hns3: Fixes the ether address copy with appropriate API

 drivers/net/ethernet/hisilicon/hns3/hnae3.c| 43 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  8 +++-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 20 --
 .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c |  7 ++--
 4 files changed, 35 insertions(+), 43 deletions(-)

-- 
2.7.4




[PATCH V2 net 2/7] net: hns3: Fixes the command used to unmap ring from vector

2017-09-19 Thread Salil Mehta
From: Lipeng 

This patch fixes the IMP command being used to unmap the vector
from the corresponding ring.

Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine &
Compatibility Layer Support")
Signed-off-by: Lipeng 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index db4e07d..e324bc6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2779,7 +2779,7 @@ static int hclge_unmap_ring_from_vector(
}
i = 0;
hclge_cmd_setup_basic_desc(&desc,
-  HCLGE_OPC_ADD_RING_TO_VECTOR,
+  HCLGE_OPC_DEL_RING_TO_VECTOR,
   false);
req->int_vector_id = vector_id;
}
-- 
2.7.4




[PATCH V2 net 3/7] net: hns3: Fixes ring-to-vector map-and-unmap command

2017-09-19 Thread Salil Mehta
From: Lipeng 

This patch fixes the vector-to-ring map and unmap command and adds
INT_GL(for, Gap Limiting Interrupts) and VF id to it as required
by the hardware interface.

Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine &
Compatibility Layer Support")
Signed-off-by: Lipeng 
Signed-off-by: Mingguang Qu 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h  | 8 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 91ae013..c2b613b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -238,7 +238,7 @@ struct hclge_tqp_map {
u8 rsv[18];
 };
 
-#define HCLGE_VECTOR_ELEMENTS_PER_CMD  11
+#define HCLGE_VECTOR_ELEMENTS_PER_CMD  10
 
 enum hclge_int_type {
HCLGE_INT_TX,
@@ -252,8 +252,12 @@ struct hclge_ctrl_vector_chain {
 #define HCLGE_INT_TYPE_S   0
 #define HCLGE_INT_TYPE_M   0x3
 #define HCLGE_TQP_ID_S 2
-#define HCLGE_TQP_ID_M (0x3fff << HCLGE_TQP_ID_S)
+#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S)
+#define HCLGE_INT_GL_IDX_S 13
+#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S)
__le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD];
+   u8 vfid;
+   u8 rsv;
 };
 
 #define HCLGE_TC_NUM   8
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index e324bc6..eafd9c6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2680,7 +2680,11 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport 
*vport, int vector_id,
   hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
   HCLGE_TQP_ID_S,  node->tqp_index);
+   hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+  HCLGE_INT_GL_IDX_S,
+  hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+   req->vfid = vport->vport_id;
 
if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD;
@@ -2764,8 +2768,12 @@ static int hclge_unmap_ring_from_vector(
   hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
   HCLGE_TQP_ID_S,  node->tqp_index);
+   hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+  HCLGE_INT_GL_IDX_S,
+  hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
 
req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+   req->vfid = vport->vport_id;
 
if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD;
-- 
2.7.4




[PATCH V2 net 5/7] net: hns3: Fixes the ether address copy with appropriate API

2017-09-19 Thread Salil Mehta
This patch replaces the ethernet address copy instance with more
appropriate ether_addr_copy() function.

Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine &
Compatibility Layer Support")
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index eafd9c6..8e172af 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1063,8 +1063,7 @@ static int hclge_configure(struct hclge_dev *hdev)
hdev->base_tqp_pid = 0;
hdev->rss_size_max = 1;
hdev->rx_buf_len = cfg.rx_buf_len;
-   for (i = 0; i < ETH_ALEN; i++)
-   hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i];
+   ether_addr_copy(hdev->hw.mac.mac_addr, cfg.mac_addr);
hdev->hw.mac.media_type = cfg.media_type;
hdev->hw.mac.phy_addr = cfg.phy_addr;
hdev->num_desc = cfg.tqp_desc_num;
-- 
2.7.4




Re: [RFC net-next v2] bridge lwtunnel, VPLS & NVGRE

2017-09-19 Thread Amine Kherbouche

Hi David,

What's next ? do you plan to send a v3 or should I do it ?

On 09/11/2017 10:02 AM, Amine Kherbouche wrote:

Hi David,

Do you plan to send a v3?

On 21/08/2017 18:15, David Lamparter wrote:

Hi all,


this is an update on the earlier "[RFC net-next] VPLS support".  Note
I've changed the subject lines on some of the patches to better reflect
what they really do (tbh the earlier subject lines were crap.)

As previously, iproute2 / FRR patches are at:
- https://github.com/eqvinox/vpls-iproute2
- https://github.com/opensourcerouting/frr/commits/vpls
while this patchset is also available at:
- https://github.com/eqvinox/vpls-linux-kernel
(but please be aware that I'm amending and rebasing commits)


Re: [PATCH net-next v2 12/12] net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port

2017-09-19 Thread Vivien Didelot
Florian Fainelli  writes:

> Export b53_{enable,disable}_port and use these two functions in
> bcm_sf2_port_setup and bcm_sf2_port_disable. The generic functions
> cannot be used without wrapping because we need to manage additional
> switch integration details (PHY, Broadcom tag etc.).
>
> Signed-off-by: Florian Fainelli 

Reviewed-by: Vivien Didelot 


Re: [PATCH net-next 4/4] test_rhashtable: add test case for rhl_table interface

2017-09-19 Thread kbuild test robot
Hi Florian,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Florian-Westphal/test_rhashtable-add-test-case-for-rhl-table/20170919-135550
config: x86_64-randconfig-a0-09192105 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   lib/test_rhashtable.c: In function 'test_rhltable':
>> lib/test_rhashtable.c:433: warning: the frame size of 2144 bytes is larger 
>> than 2048 bytes

vim +433 lib/test_rhashtable.c

   254  
   255  static int __init test_rhltable(unsigned int entries)
   256  {
   257  struct test_obj_rhl *rhl_test_objects;
   258  unsigned long *obj_in_table;
   259  struct rhltable rhlt;
   260  unsigned int i, j, k;
   261  int ret, err;
   262  
   263  if (entries == 0)
   264  entries = 1;
   265  
   266  rhl_test_objects = vzalloc(sizeof(*rhl_test_objects) * entries);
   267  if (!rhl_test_objects)
   268  return -ENOMEM;
   269  
   270  ret = -ENOMEM;
   271  obj_in_table = vzalloc(BITS_TO_LONGS(entries) * sizeof(unsigned 
long));
   272  if (!obj_in_table)
   273  goto out_free;
   274  
   275  /* nulls_base not supported in rhlist interface */
   276  test_rht_params.nulls_base = 0;
   277  err = rhltable_init(&rhlt, &test_rht_params);
   278  if (WARN_ON(err))
   279  goto out_free;
   280  
   281  k = prandom_u32();
   282  ret = 0;
   283  for (i = 0; i < entries; i++) {
   284  rhl_test_objects[i].value.id = k;
   285  err = rhltable_insert(&rhlt, 
&rhl_test_objects[i].list_node,
   286test_rht_params);
   287  if (WARN(err, "error %d on element %d\n", err, i))
   288  break;
   289  if (err == 0)
   290  set_bit(i, obj_in_table);
   291  }
   292  
   293  if (err)
   294  ret = err;
   295  
   296  pr_info("test %d add/delete pairs into rhlist\n", entries);
   297  for (i = 0; i < entries; i++) {
   298  struct rhlist_head *h, *pos;
   299  struct test_obj_rhl *obj;
   300  struct test_obj_val key = {
   301  .id = k,
   302  };
   303  bool found;
   304  
   305  rcu_read_lock();
   306  h = rhltable_lookup(&rhlt, &key, test_rht_params);
   307  if (WARN(!h, "key not found during iteration %d of %d", 
i, entries)) {
   308  rcu_read_unlock();
   309  break;
   310  }
   311  
   312  if (i) {
   313  j = i - 1;
   314  rhl_for_each_entry_rcu(obj, pos, h, list_node) {
   315  if (WARN(pos == 
&rhl_test_objects[j].list_node, "old element found, should be gone"))
   316  break;
   317  }
   318  }
   319  
   320  cond_resched_rcu();
   321  
   322  found = false;
   323  
   324  rhl_for_each_entry_rcu(obj, pos, h, list_node) {
   325  if (pos == &rhl_test_objects[i].list_node) {
   326  found = true;
   327  break;
   328  }
   329  }
   330  
   331  rcu_read_unlock();
   332  
   333  if (WARN(!found, "element %d not found", i))
   334  break;
   335  
   336  err = rhltable_remove(&rhlt, 
&rhl_test_objects[i].list_node, test_rht_params);
   337  WARN(err, "rhltable_remove: err %d for iteration %d\n", 
err, i);
   338  if (err == 0)
   339  clear_bit(i, obj_in_table);
   340  }
   341  
   342  if (ret == 0 && err)
   343  ret = err;
   344  
   345  for (i = 0; i < entries; i++) {
   346  WARN(test_bit(i, obj_in_table), "elem %d allegedly 
still present", i);
   347  
   348  err = rhltable_insert(&rhlt, 
&rhl_test_objects[i].list_node,
   349test_rht_params);
   350  if (WARN(err, "error %d on element %d\n", err, i))
   351  break;
   352  if (err == 0)
   353  set_bit(i, obj_in_table);
   354  }
   355  
   356  pr_info

Re: [PATCH net-next 00/14] gtp: Additional feature support

2017-09-19 Thread Tom Herbert
On Tue, Sep 19, 2017 at 5:43 AM, Harald Welte  wrote:
> Hi Tom,
>
> first of all, thanks a lot for your patch series.  It makes me happy to
> see contributions on the GTP code :)
>
> On Mon, Sep 18, 2017 at 05:38:50PM -0700, Tom Herbert wrote:
>>   - IPv6 support
>
> see my detailed comments in other mails.  It's unfortunately only
> support for the already "deprecated" IPv6-only PDP contexts, not the
> more modern v4v6 type.  In order to interoperate with old and new
> approach, all three cases (v4, v6 and v4v6) should be supported from one
> code base.
>
It sounds like something that can be subsequently added. Do you have a
reference to the spec?

>>   - Configurable networking interfaces so that GTP kernel can be used
>>   and tested without needing GSN network emulation (i.e. no user space
>>   daemon needed).
>
> We have some pretty decent userspace utilities for configuring the GTP
> interfaces and tunnels in the libgtpnl repository, but if it helps
> people to have another way of configuration, I won't be against it.
>
AFAIK those userspace utilities don't support IPv6. Being able to
configure GTP like any other encapsulation will facilitate development
of IPv6 and other features.

> What we have to keep in mind is that the current model of 1:1 mapping of
> a "UDP socket' to a GTP netdevice is conceptually broken and needs to be
> refactored soon (without breaking backwards compatibility).  See related
> earlier discussions with patches submitted by Andreas Schultz.
>
I don't think I changed the model, so this can evolve.

> Summary:
>
> In real-world GGSNs you often want to host multiple virtual GGSNs on a
> single GGSN (= UDP socket).  Each virtual GGSN terminates into one
> external PDN (packet data network), which can be a private corporate vpn
> or any other IP network, with no routing between those networks.
>
Sounds like network virtualization and VNIs.

> Naively one would assume you "simply" run another virtual GGSN
> instance on another IP address, and then differentiate like that.
>
> However, the problem is that adding a new GGSN IP address will require
> manual configuration changes at each of your roaming partners (easily
> hundreds of operators!) and hence it is avoided at all cost due to the
> related long schedule, requirement for interop testing with each of them,
> etc.
>
> So what you do in reality at operators is that you operate many of those
> virtual GGSNs on the same IP:Port combination (and hence UDP socket),
> which means you have PDP contexts for vGGSN A which terminate on e.g.
> gtp0 and PDP contexts for vGGSN B on gtp1, and so on.  The decision
> which gtp-device a given PDP context is a member is made by the GTP-C
> instance.  In the kenel we'll have to decouple net-devices from sockets.
>
> So whatever new configuration mechanism or architectural changes we
> introduce, we need to make sure that those will accomodate the "new
> model" rather than introducing further dependencies for which we will
> have to maintain backwards compatibility workaronds later on.
>
>>   - Port numbers are configurable
>
> I'm not sure if this is a useful feature.  GTP is used only in
> operator-controlled networks and only on standard ports.  It's not
> possible to negotiate any non-standard ports on the signaling plane
> either.
>
Bear in mind that we're not required to do everything the GTP spec
says. Adding port configuration is another one of those things that
gives us flexibility and and better capability to test without needing
a full blown GSN network. One feature I didn't implement was UDP
source for flow entropy-- as we've seen with other encapsulation
protocols this helps significantly to get good ECMP in the network. My
impression is GTP designers probably didn't think in terms of getting
best performance. But we can ;-)

>>   - Addition of a dst_cache in the GTP structure and other cleanup
>
> looks fine to me.
>
>>   - GSO,GRO
>>   - Control of zero UDP checksums
>
> [...]
>
>> Additionally, this patch set also includes a couple of general support
>> capabilities:
>>
>>   - A facility that allows application specific GSO callbacks
>>   - Common functions to get a route fo for an IP tunnel
>
> This is where the "core netdev" folks will have to comment.  I'm too
> remote from mainline kernel development these days and will focus on
> reviewing the GTP specific bits of your patch series.
>
Thanks. Obviously, I and many on this list have more expertise on the
core networking side than GTP, so your review is quite welcome.

>> For IPv6 support, the mobile subscriber needs to allow IPv6 addresses,
>> and the remote enpoint can be IPv6.
>
> Minor correction: The mobile subscriber specifically requests a PDP Type
> when establishing the PDP context via Session Management related
> signaling from MS/UE to SGSN.  The SGSN simply translates this to GTP
> and then forwards it to the GGSN.  So it's acutally not "allow" but
> "specifically request".
>
Okay.

>> 

[PATCH net-next 1/4] net: dsa: remove copy of master ethtool_ops

2017-09-19 Thread Vivien Didelot
There is no need to store a copy of the master ethtool ops, storing the
original pointer in DSA and the new one in the master netdev itself is
enough.

In the meantime, set orig_ethtool_ops to NULL when restoring the master
ethtool ops and check the presence of the master original ethtool ops as
well as its needed functions before calling them.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h |  1 -
 net/dsa/dsa.c |  8 
 net/dsa/slave.c   | 19 +++
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index dd44d6ce1097..8dee216a5a9b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -188,7 +188,6 @@ struct dsa_port {
/*
 * Original copy of the master netdev ethtool_ops
 */
-   struct ethtool_ops  ethtool_ops;
const struct ethtool_ops *orig_ethtool_ops;
 };
 
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 03c58b0eb082..abadf7b49236 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -124,11 +124,10 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp)
if (!cpu_ops)
return -ENOMEM;
 
-   memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops,
-  sizeof(struct ethtool_ops));
cpu_dp->orig_ethtool_ops = master->ethtool_ops;
-   memcpy(cpu_ops, &cpu_dp->ethtool_ops,
-  sizeof(struct ethtool_ops));
+   if (cpu_dp->orig_ethtool_ops)
+   memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops));
+
dsa_cpu_port_ethtool_init(cpu_ops);
master->ethtool_ops = cpu_ops;
 
@@ -138,6 +137,7 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp)
 void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp)
 {
cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+   cpu_dp->orig_ethtool_ops = NULL;
 }
 
 void dsa_cpu_dsa_destroy(struct dsa_port *port)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 2afa99506f8b..2ff4f907d137 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -574,12 +574,13 @@ static void dsa_cpu_port_get_ethtool_stats(struct 
net_device *dev,
struct dsa_switch_tree *dst = dev->dsa_ptr;
struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
struct dsa_switch *ds = cpu_dp->ds;
+   const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
s8 cpu_port = cpu_dp->index;
int count = 0;
 
-   if (cpu_dp->ethtool_ops.get_sset_count) {
-   count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS);
-   cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data);
+   if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+   count = ops->get_sset_count(dev, ETH_SS_STATS);
+   ops->get_ethtool_stats(dev, stats, data);
}
 
if (ds->ops->get_ethtool_stats)
@@ -591,10 +592,11 @@ static int dsa_cpu_port_get_sset_count(struct net_device 
*dev, int sset)
struct dsa_switch_tree *dst = dev->dsa_ptr;
struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
struct dsa_switch *ds = cpu_dp->ds;
+   const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
int count = 0;
 
-   if (cpu_dp->ethtool_ops.get_sset_count)
-   count += cpu_dp->ethtool_ops.get_sset_count(dev, sset);
+   if (ops && ops->get_sset_count)
+   count += ops->get_sset_count(dev, sset);
 
if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
count += ds->ops->get_sset_count(ds);
@@ -608,6 +610,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
struct dsa_switch_tree *dst = dev->dsa_ptr;
struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
struct dsa_switch *ds = cpu_dp->ds;
+   const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
s8 cpu_port = cpu_dp->index;
int len = ETH_GSTRING_LEN;
int mcount = 0, count;
@@ -619,9 +622,9 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
 
-   if (cpu_dp->ethtool_ops.get_sset_count) {
-   mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS);
-   cpu_dp->ethtool_ops.get_strings(dev, stringset, data);
+   if (ops && ops->get_sset_count && ops->get_strings) {
+   mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+   ops->get_strings(dev, stringset, data);
}
 
if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
-- 
2.14.1



Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c

2017-09-19 Thread Oleksandr Natalenko
And 2 more events:

===
$ dmesg --time-format iso | grep RIP
…
2017-09-19T16:52:21,623328+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
2017-09-19T16:52:40,455296+0200 RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990
2017-09-19T16:52:41,047378+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
…
2017-09-19T16:54:59,930726+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
2017-09-19T16:55:07,985767+0200 RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990
2017-09-19T16:55:41,911527+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
…
===

On pondělí 18. září 2017 23:40:08 CEST Yuchung Cheng wrote:
> On Mon, Sep 18, 2017 at 1:46 PM, Oleksandr Natalenko
> 
>  wrote:
> > Actually, same warning was just triggered with RACK enabled. But main
> > warning was not triggered in this case.
> 
> Thanks.
> 
> I assume this kernel does not have the patch that Neal proposed in his
> first reply?
> 
> The main warning needs to be triggered by another peculiar SACK that
> kicks the sender into recovery again (after undo). Please let it run
> longer if possible to see if we can get both. But the new data does
> indicate the we can (validly) be in CA_Open with retrans_out > 0.
> 
> > ===
> > Sep 18 22:44:32 defiant kernel: [ cut here ]
> > Sep 18 22:44:32 defiant kernel: WARNING: CPU: 1 PID: 702 at net/ipv4/
> > tcp_input.c:2392 tcp_undo_cwnd_reduction+0xbd/0xd0
> > Sep 18 22:44:32 defiant kernel: Modules linked in: netconsole ctr ccm
> > cls_bpf sch_htb act_mirred cls_u32 sch_ingress sit tunnel4 ip_tunnel
> > 8021q mrp nf_conntrack_ipv6 nf_defrag_ipv6 nft_ct nft_set_bitmap
> > nft_set_hash nft_set_rbtree nf_tables_inet nf_tables_ipv6 nft_masq_ipv4
> > nf_nat_masquerade_ipv4 nft_masq nft_nat nft_counter nft_meta
> > nft_chain_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat
> > nf_conntrack libcrc32c crc32c_generic nf_tables_ipv4 nf_tables tun nct6775
> > nfnetlink hwmon_vid nls_iso8859_1 nls_cp437 vfat fat ext4
> > snd_hda_codec_hdmi mbcache jbd2 snd_hda_codec_realtek
> > snd_hda_codec_generic f2fs arc4 fscrypto intel_rapl iTCO_wdt ath9k
> > iTCO_vendor_support intel_powerclamp ath9k_common ath9k_hw coretemp
> > kvm_intel ath mac80211 kvm irqbypass intel_cstate cfg80211 pcspkr
> > snd_hda_intel snd_hda_codec r8169
> > Sep 18 22:44:32 defiant kernel:  joydev evdev mii snd_hda_core mousedev
> > mei_txe input_leds i2c_i801 mac_hid i915 lpc_ich mei shpchp snd_hwdep
> > snd_intel_sst_acpi snd_intel_sst_core snd_soc_rt5670
> > snd_soc_sst_atom_hifi2_platform battery snd_soc_sst_match snd_soc_rl6231
> > drm_kms_helper hci_uart ov5693(C) ov2722(C) lm3554(C) btbcm btqca
> > v4l2_common snd_soc_core btintel snd_compress videodev snd_pcm_dmaengine
> > snd_pcm video bluetooth snd_timer drm media tpm_tis snd i2c_hid soundcore
> > tpm_tis_core rfkill_gpio ac97_bus soc_button_array ecdh_generic rfkill
> > crc16 tpm 8250_dw intel_gtt syscopyarea sysfillrect acpi_pad sysimgblt
> > intel_int0002_vgpio fb_sys_fops pinctrl_cherryview i2c_algo_bit button
> > sch_fq_codel tcp_bbr ifb ip_tables x_tables btrfs xor raid6_pq
> > algif_skcipher af_alg hid_logitech_hidpp hid_logitech_dj usbhid hid uas
> > Sep 18 22:44:32 defiant kernel:  usb_storage dm_crypt dm_mod dax raid10
> > md_mod sd_mod crct10dif_pclmul crc32_pclmul crc32c_intel
> > ghash_clmulni_intel pcbc ahci aesni_intel xhci_pci libahci aes_x86_64
> > crypto_simd glue_helper xhci_hcd cryptd libata usbcore scsi_mod
> > usb_common serio sdhci_acpi sdhci led_class mmc_core
> > Sep 18 22:44:32 defiant kernel: CPU: 1 PID: 702 Comm: irq/123-enp3s0
> > Tainted: GWC  4.13.0-pf4 #1
> > Sep 18 22:44:32 defiant kernel: Hardware name: To Be Filled By O.E.M. To
> > Be
> > Filled By O.E.M./J3710-ITX, BIOS P1.30 03/30/2016
> > Sep 18 22:44:32 defiant kernel: task: 88923a738000 task.stack:
> > 95800150
> > Sep 18 22:44:32 defiant kernel: RIP:
> > 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> > Sep 18 22:44:32 defiant kernel: RSP: 0018:88927fc83a48 EFLAGS:
> > 00010202
> > Sep 18 22:44:32 defiant kernel: RAX: 0001 RBX:
> > 8892412d9800
> > RCX: 88927fc83b0c
> > Sep 18 22:44:32 defiant kernel: RDX: 7fff RSI:
> > 0001
> > RDI: 8892412d9800
> > Sep 18 22:44:32 defiant kernel: RBP: 88927fc83a50 R08:
> > 
> > R09: 18dfb063
> > Sep 18 22:44:32 defiant kernel: R10: 18dfd223 R11:
> > 18dfb063
> > R12: 5320
> > Sep 18 22:44:32 defiant kernel: R13: 88927fc83b10 R14:
> > 0001
> > R15: 88927fc83b0c
> > Sep 18 22:44:32 defiant kernel: FS:  ()
> > GS:88927fc8() knlGS:
> > Sep 18 22:44:32 defiant kernel: CS:  0010 DS:  ES:  CR0:
> > 80050033
> > Sep 18 22:44:32 defiant kernel: CR2: 7f1cd1a43620 CR3:
> > 000114a09000
> > CR4: 001006e0
> > Sep 18 22:44:32 defiant kernel: Call Trace:
> > Sep 18 22:44:32 defiant kernel:  
> > Sep 18 22:44:32 defiant kernel: 

Re: [PATCH net-next 03/14] gtp: Call common functions to get tunnel routes and add dst_cache

2017-09-19 Thread Tom Herbert
On Mon, Sep 18, 2017 at 9:17 PM, David Miller  wrote:
> From: Tom Herbert 
> Date: Mon, 18 Sep 2017 17:38:53 -0700
>
>> Call ip_tunnel_get_route and dst_cache to pdp context which should
>> improve performance by obviating the need to perform a route lookup
>> on every packet.
>>
>> Signed-off-by: Tom Herbert 
>
> Not caused by your changes, but something to think about:
>
>> -static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4,
>> -const struct sock *sk,
>> -__be32 daddr)
>> -{
>> - memset(fl4, 0, sizeof(*fl4));
>> - fl4->flowi4_oif = sk->sk_bound_dev_if;
>> - fl4->daddr  = daddr;
>> - fl4->saddr  = inet_sk(sk)->inet_saddr;
>> - fl4->flowi4_tos = RT_CONN_FLAGS(sk);
>> - fl4->flowi4_proto   = sk->sk_protocol;
>> -
>> - return ip_route_output_key(sock_net(sk), fl4);
>> -}
>
> This and the new dst caching code ignores any source address selection
> done by ip_route_output_key() or the new tunnel route lookup helpers.
>
> Either source address selection should be respected, or if saddr will
> never be modified by a route lookup for some specific reason here,
> that should be documented.

Yes, I noticed that. In this case the source address is intended to be
taken bound on the socket which would imply we aren't interested in
source address selection.

Tom


Re: [PATCH net-next 1/3] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE

2017-09-19 Thread Daniel Borkmann

On 09/19/2017 05:08 PM, Craig Gallek wrote:

On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitov  wrote:

On 9/18/17 12:30 PM, Craig Gallek wrote:

[...]

+
+   next_bit = extract_bit(key->data, node->prefixlen);
+   /* If we hit a node that has more than one child or is a
valid
+* prefix itself, do not remove it. Reset the root of the
trim
+* path to its descendant on our path.
+*/
+   if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
+   (node->child[0] && node->child[1]))
+   trim = &node->child[next_bit];
+   node = rcu_dereference_protected(
+   node->child[next_bit],
lockdep_is_held(&trie->lock));
+   }
+
+   if (!node || node->prefixlen != key->prefixlen ||
+   (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   trie->n_entries--;
+
+   /* If the node we are removing is not a leaf node, simply mark it
+* as intermediate and we are done.
+*/
+   if (rcu_access_pointer(node->child[0]) ||
+   rcu_access_pointer(node->child[1])) {
+   node->flags |= LPM_TREE_NODE_FLAG_IM;
+   goto out;
+   }
+
+   /* trim should now point to the slot holding the start of a path
from
+* zero or more intermediate nodes to our leaf node for deletion.
+*/
+   while ((node = rcu_dereference_protected(
*trim, lockdep_is_held(&trie->lock)))) {
+   RCU_INIT_POINTER(*trim, NULL);
+   trim = rcu_access_pointer(node->child[0]) ?
&node->child[0] :
&node->child[1];
+   kfree_rcu(node, rcu);


can it be that some of the nodes this loop walks have
both child[0] and [1] ?

No, the loop above will push trim down the walk every time it
encounters a node with two children.  The only other trim assignment
is the initial trim = &trie->root.  But the only time we would skip
the assignment in the loop is if the node being removed is the root.
If the root had multiple children and is being removed, it would be
handled by the case that turns the node into an intermediate node
rather than walking the trim path freeing things.


Looks good to me. We should probably still merge nodes once we turn
a real node into an im which just has a single child attached to it;
parent can be im or real node. Thus, we don't need to traverse this
extra one on lookup.

Acked-by: Daniel Borkmann 


Re: [PATCH net-next 3/3] bpf: Test deletion in BPF_MAP_TYPE_LPM_TRIE

2017-09-19 Thread Daniel Borkmann

On 09/18/2017 09:30 PM, Craig Gallek wrote:

From: Craig Gallek 

Extend the 'random' operation tests to include a delete operation
(delete half of the nodes from both lpm implementions and ensure
that lookups are still equivalent).

Also, add a simple IPv4 test which verifies lookup behavior as nodes
are deleted from the tree.

Signed-off-by: Craig Gallek 


Acked-by: Daniel Borkmann 


Re: Re: [PATCH] net/packet: fix race condition between fanout_add and __unregister_prot_hook

2017-09-19 Thread Willem de Bruijn
On Tue, Sep 19, 2017 at 12:09 PM, Willem de Bruijn
 wrote:
> On Tue, Sep 19, 2017 at 3:21 AM, Nixiaoming  wrote:
>> On Fri, Sep 15, 2017 at 10:46 AM, Willem de Bruijn
>>
>>  wrote:
>>
>>>
>>
>>> In case of failure we also need to unlink and free match. I
>>
>>> sent the following:
>>
>>>
>>
>>> http://patchwork.ozlabs.org/patch/813945/
>>
>>
>>
>> +   spin_lock(&po->bind_lock);
>>
>> +   if (po->running &&
>>
>> +   match->type == type &&
>>
>>match->prot_hook.type == po->prot_hook.type &&
>>
>>match->prot_hook.dev == po->prot_hook.dev) {
>>
>> err = -ENOSPC;
>>
>> @@ -1761,6 +1760,13 @@  static int fanout_add(struct sock *sk, u16 id, u16
>> type_flags)
>>
>>   err = 0;
>>
>> }
>>
>>}
>>
>> +   spin_unlock(&po->bind_lock);
>>
>> +
>>
>> +   if (err && !refcount_read(&match->sk_ref)) {
>>
>> +list_del(&match->list);
>>
>> +kfree(match);
>>
>> +   }
>>
>>
>>
>>
>>
>> In the function fanout_add add spin_lock to protect po-> running and po->
>> fanout,
>>
>> then whether it should be in the function fanout_release also add spin_lock
>> protection ?
>
> po->bind_lock is held when registering and unregistering the
> protocol hook. fanout_release does access po->running or
> prot_hook.

whoops. does *not* access.


Re: [PATCH net-next 2/3] bpf: Add uniqueness invariant to trivial lpm test implementation

2017-09-19 Thread Daniel Borkmann

On 09/18/2017 09:30 PM, Craig Gallek wrote:

From: Craig Gallek 

The 'trivial' lpm implementation in this test allows equivalent nodes
to be added (that is, nodes consisting of the same prefix and prefix
length).  For lookup operations, this is fine because insertion happens
at the head of the (singly linked) list and the first, best match is
returned.  In order to support deletion, the tlpm data structue must
first enforce uniqueness.  This change modifies the insertion algorithm
to search for equivalent nodes and remove them.  Note: the
BPF_MAP_TYPE_LPM_TRIE already has a uniqueness invariant that is
implemented as node replacement.

Signed-off-by: Craig Gallek 


Acked-by: Daniel Borkmann 


[RFC PATCH 3/3] usbnet: Fix memory leak when rx_submit() fails

2017-09-19 Thread Douglas Anderson
If rx_submit() returns an error code then nobody calls usb_free_urb().
That means it's leaked.

NOTE: This problem was found solely by code inspection and not due to
any failing test cases.

Signed-off-by: Douglas Anderson 
---

 drivers/net/usb/usbnet.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index e72547d8d0e6..4c067aaeea5a 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1182,9 +1182,12 @@ usbnet_deferred_kevent (struct work_struct *work)
usb_free_urb(urb);
goto fail_lowmem;
}
-   if (rx_submit (dev, urb, GFP_KERNEL) ==
-   -ENOLINK)
-   resched = 0;
+   status = rx_submit (dev, urb, GFP_KERNEL);
+   if (status) {
+   usb_free_urb(urb);
+   if (status == -ENOLINK)
+   resched = 0;
+   }
usb_autopm_put_interface(dev->intf);
 fail_lowmem:
if (resched)
-- 
2.14.1.690.gbb1197296e-goog



[PATCH net] bpf: do not disable/enable BH in bpf_map_free_id()

2017-09-19 Thread Eric Dumazet
From: Eric Dumazet 

syzkaller reported following splat [1]

Since hard irq are disabled by the caller, bpf_map_free_id()
should not try to enable/disable BH.

Another solution would be to change htab_map_delete_elem() to
defer the free_htab_elem() call after
raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not
enough to cover other code paths.

[1]
WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip
+0x1e/0x160 kernel/softirq.c:161
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+
#23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 panic+0x1e4/0x417 kernel/panic.c:181
 __warn+0x1c4/0x1d9 kernel/panic.c:542
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161
RSP: 0018:8801cdcd7748 EFLAGS: 00010046
RAX: 0082 RBX: 0201 RCX: 
RDX: 10b5933c RSI: 0201 RDI: 85ac99e0
RBP: 8801cdcd7758 R08: 85b87158 R09: 110039b9aec6
R10: 8801c99f24c0 R11: 0002 R12: 817b0b47
R13: dc00 R14: 8801cdcd77e8 R15: 0001
 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline]
 _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207
 spin_unlock_bh include/linux/spinlock.h:361 [inline]
 bpf_map_free_id kernel/bpf/syscall.c:197 [inline]
 __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227
 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235
 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96
 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658
 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063
 map_delete_elem kernel/bpf/syscall.c:633 [inline]
 SYSC_bpf kernel/bpf/syscall.c:1479 [inline]
 SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID")
Signed-off-by: Eric Dumazet 
Cc: Martin KaFai Lau 
---
 kernel/bpf/syscall.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 
cb17e1cd1d434dc2e052a2a9fb0aea967fcf4417..25d074920a009ff682d97bf88e68f466c79bd564
 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 
 static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
+   unsigned long flags;
+
if (do_idr_lock)
-   spin_lock_bh(_idr_lock);
+   spin_lock_irqsave(_idr_lock, flags);
else
__acquire(_idr_lock);
 
idr_remove(_idr, map->id);
 
if (do_idr_lock)
-   spin_unlock_bh(_idr_lock);
+   spin_unlock_irqrestore(_idr_lock, flags);
else
__release(_idr_lock);
 }




[RFC PATCH 2/3] usbnet: Avoid potential races in usbnet_deferred_kevent()

2017-09-19 Thread Douglas Anderson
In general when you've got a flag communicating that "something needs
to be done" you want to clear that flag _before_ doing the task.  If
you clear the flag _after_ doing the task you end up with the risk
that this will happen:

1. Requester sets flag saying task A needs to be done.
2. Worker comes and starts doing task A.
3. Worker finishes task A but hasn't yet cleared the flag.
4. Requester wants to set flag saying task A needs to be done again.
5. Worker clears the flag without doing anything.

Let's make the usbnet codebase consistently clear the flag _before_ it
does the requested work.  That way if there's another request to do
the work while the work is already in progress it won't be lost.

NOTES:
- No known bugs are fixed by this; it's just found by code inspection.
- This changes the semantics in some of the error conditions.
  -> If we fail to clear the "tx halt" or "rx halt" we still clear the
 flag and thus won't retry the clear next time we happen to be in
 the work function.  Had the old code really wanted to retry these
 events it should have re-scheduled the worker anyway.
  -> If we fail to allocate memory in usb_alloc_urb() we will still
 clear the EVENT_RX_MEMORY flag.  This makes it consistent with
 how we would deal with other failures, including failure to
 allocate a memory chunk in rx_submit().  It can also be noted
 that usb_alloc_urb() in this case is allocating much less than 4K
 worth of data and probably never fails.

Signed-off-by: Douglas Anderson 
---

 drivers/net/usb/usbnet.c | 50 +---
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index a3e8dbaadcf9..e72547d8d0e6 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1103,8 +1103,6 @@ static void __handle_link_change(struct usbnet *dev)
 
/* hard_mtu or rx_urb_size may change during link change */
usbnet_update_max_qlen(dev);
-
-   clear_bit(EVENT_LINK_CHANGE, >flags);
 }
 
 static void usbnet_set_rx_mode(struct net_device *net)
@@ -1118,8 +1116,6 @@ static void __handle_set_rx_mode(struct usbnet *dev)
 {
if (dev->driver_info->set_rx_mode)
(dev->driver_info->set_rx_mode)(dev);
-
-   clear_bit(EVENT_SET_RX_MODE, >flags);
 }
 
 /* work that cannot be done in interrupt context uses keventd.
@@ -1135,7 +1131,7 @@ usbnet_deferred_kevent (struct work_struct *work)
int status;
 
/* usb_clear_halt() needs a thread context */
-   if (test_bit (EVENT_TX_HALT, >flags)) {
+   if (test_and_clear_bit (EVENT_TX_HALT, >flags)) {
unlink_urbs (dev, >txq);
status = usb_autopm_get_interface(dev->intf);
if (status < 0)
@@ -1150,12 +1146,11 @@ usbnet_deferred_kevent (struct work_struct *work)
netdev_err(dev->net, "can't clear tx halt, 
status %d\n",
   status);
} else {
-   clear_bit (EVENT_TX_HALT, >flags);
if (status != -ESHUTDOWN)
netif_wake_queue (dev->net);
}
}
-   if (test_bit (EVENT_RX_HALT, >flags)) {
+   if (test_and_clear_bit (EVENT_RX_HALT, >flags)) {
unlink_urbs (dev, >rxq);
status = usb_autopm_get_interface(dev->intf);
if (status < 0)
@@ -1170,41 +1165,39 @@ usbnet_deferred_kevent (struct work_struct *work)
netdev_err(dev->net, "can't clear rx halt, 
status %d\n",
   status);
} else {
-   clear_bit (EVENT_RX_HALT, >flags);
tasklet_schedule (>bh);
}
}
 
/* tasklet could resubmit itself forever if memory is tight */
-   if (test_bit (EVENT_RX_MEMORY, >flags)) {
+   if (test_and_clear_bit (EVENT_RX_MEMORY, >flags)) {
struct urb  *urb = NULL;
int resched = 1;
 
-   if (netif_running (dev->net))
+   if (netif_running (dev->net)) {
urb = usb_alloc_urb (0, GFP_KERNEL);
-   else
-   clear_bit (EVENT_RX_MEMORY, >flags);
-   if (urb != NULL) {
-   clear_bit (EVENT_RX_MEMORY, >flags);
-   status = usb_autopm_get_interface(dev->intf);
-   if (status < 0) {
-   usb_free_urb(urb);
-   goto fail_lowmem;
-   }
-   if (rx_submit (dev, urb, GFP_KERNEL) == -ENOLINK)
-   resched = 0;
-   usb_autopm_put_interface(dev->intf);
+   if (urb != NULL) {
+   

[PATCH V2 net 7/7] net: hns3: Fixes the premature exit of loop when matching clients

2017-09-19 Thread Salil Mehta
From: Lipeng 

When register/unregister ae_dev, ae_dev should match all client
in the client_list. Enet and roce can co-exists together so we
should continue checking for enet and roce presence together.
So break should not be there.

Above caused problems in loading and unloading of modules.

Fixes: 38eddd126772 ("net: hns3: Add support of the HNAE3 framework")
Signed-off-by: Lipeng 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c | 43 ++---
 1 file changed, 9 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 59efbd6..5bcb223 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -37,20 +37,15 @@ static bool hnae3_client_match(enum hnae3_client_type 
client_type,
 }
 
 static int hnae3_match_n_instantiate(struct hnae3_client *client,
-struct hnae3_ae_dev *ae_dev,
-bool is_reg, bool *matched)
+struct hnae3_ae_dev *ae_dev, bool is_reg)
 {
int ret;
 
-   *matched = false;
-
/* check if this client matches the type of ae_dev */
if (!(hnae3_client_match(client->type, ae_dev->dev_type) &&
  hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))) {
return 0;
}
-   /* there is a match of client and dev */
-   *matched = true;
 
/* now, (un-)instantiate client by calling lower layer */
if (is_reg) {
@@ -69,7 +64,6 @@ int hnae3_register_client(struct hnae3_client *client)
 {
struct hnae3_client *client_tmp;
struct hnae3_ae_dev *ae_dev;
-   bool matched;
int ret = 0;
 
mutex_lock(_common_lock);
@@ -86,7 +80,7 @@ int hnae3_register_client(struct hnae3_client *client)
/* if the client could not be initialized on current port, for
 * any error reasons, move on to next available port
 */
-   ret = hnae3_match_n_instantiate(client, ae_dev, true, );
+   ret = hnae3_match_n_instantiate(client, ae_dev, true);
if (ret)
dev_err(_dev->pdev->dev,
"match and instantiation failed for port\n");
@@ -102,12 +96,11 @@ EXPORT_SYMBOL(hnae3_register_client);
 void hnae3_unregister_client(struct hnae3_client *client)
 {
struct hnae3_ae_dev *ae_dev;
-   bool matched;
 
mutex_lock(_common_lock);
/* un-initialize the client on every matched port */
list_for_each_entry(ae_dev, _ae_dev_list, node) {
-   hnae3_match_n_instantiate(client, ae_dev, false, );
+   hnae3_match_n_instantiate(client, ae_dev, false);
}
 
list_del(>node);
@@ -124,7 +117,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo)
const struct pci_device_id *id;
struct hnae3_ae_dev *ae_dev;
struct hnae3_client *client;
-   bool matched;
int ret = 0;
 
mutex_lock(_common_lock);
@@ -151,13 +143,10 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo)
 * initialize the figure out client instance
 */
list_for_each_entry(client, _client_list, node) {
-   ret = hnae3_match_n_instantiate(client, ae_dev, true,
-   );
+   ret = hnae3_match_n_instantiate(client, ae_dev, true);
if (ret)
dev_err(_dev->pdev->dev,
"match and instantiation failed\n");
-   if (matched)
-   break;
}
}
 
@@ -175,7 +164,6 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo)
const struct pci_device_id *id;
struct hnae3_ae_dev *ae_dev;
struct hnae3_client *client;
-   bool matched;
 
mutex_lock(_common_lock);
/* Check if there are matched ae_dev */
@@ -187,12 +175,8 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo 
*ae_algo)
/* check the client list for the match with this ae_dev type and
 * un-initialize the figure out client instance
 */
-   list_for_each_entry(client, _client_list, node) {
-   hnae3_match_n_instantiate(client, ae_dev, false,
- );
-   if (matched)
-   break;
-   }
+   list_for_each_entry(client, _client_list, node)
+   hnae3_match_n_instantiate(client, ae_dev, false);
 
ae_algo->ops->uninit_ae_dev(ae_dev);

[PATCH V2 net 1/7] net: hns3: Fixes initialization of phy address from firmware

2017-09-19 Thread Salil Mehta
From: Lipeng 

Default phy address of every port is 0. Therefore, phy address for
each port need to be fetched from firmware and device initialized
with fetched non-default phy address.

Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine &
Compatibility Layer Support")
Signed-off-by: Lipeng 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index bb45365..db4e07d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1066,6 +1066,7 @@ static int hclge_configure(struct hclge_dev *hdev)
for (i = 0; i < ETH_ALEN; i++)
hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i];
hdev->hw.mac.media_type = cfg.media_type;
+   hdev->hw.mac.phy_addr = cfg.phy_addr;
hdev->num_desc = cfg.tqp_desc_num;
hdev->tm_info.num_pg = 1;
hdev->tm_info.num_tc = cfg.tc_num;
-- 
2.7.4




Re: [PATCH net v2] l2tp: fix race condition in l2tp_tunnel_delete

2017-09-19 Thread Guillaume Nault
On Tue, Sep 19, 2017 at 03:40:40PM +0200, Sabrina Dubroca wrote:
> If we try to delete the same tunnel twice, the first delete operation
> does a lookup (l2tp_tunnel_get), finds the tunnel, calls
> l2tp_tunnel_delete, which queues it for deletion by
> l2tp_tunnel_del_work.
> 
> The second delete operation also finds the tunnel and calls
> l2tp_tunnel_delete. If the workqueue has already fired and started
> running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the
> same tunnel a second time, and try to free the socket again.
> 
> Add a dead flag to prevent firing the workqueue twice. Then we can
> remove the check of queue_work's result that was meant to prevent that
> race but doesn't.
> 
> Also check the flag in the tunnel lookup functions, to avoid returning a
> tunnel that is already scheduled for destruction.
> 
> Reproducer:
> 
> ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 
> remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000
> ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 
> peer_session_id 2000
> ip link set l2tp1 up
> ip l2tp del tunnel tunnel_id 3000
> ip l2tp del tunnel tunnel_id 3000
> 
> Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue")
> Reported-by: Jianlin Shi 
> Signed-off-by: Sabrina Dubroca 
> ---
> v2: as Tom Parkin explained, we can't remove the tunnel from the
> per-net list from netlink. v2 uses only a dead flag, and adds
> corresponding checks during lookups
> 
>  net/l2tp/l2tp_core.c | 18 +-
>  net/l2tp/l2tp_core.h |  5 -
>  2 files changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> index ee485df73ccd..3891f0260f2b 100644
> --- a/net/l2tp/l2tp_core.c
> +++ b/net/l2tp/l2tp_core.c
> @@ -203,7 +203,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net 
> *net, u32 tunnel_id)
>  
>   rcu_read_lock_bh();
>   list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) {
> - if (tunnel->tunnel_id == tunnel_id) {
> + if (tunnel->tunnel_id == tunnel_id &&
> + !test_bit(0, >dead)) {
>   l2tp_tunnel_inc_refcount(tunnel);
>   rcu_read_unlock_bh();
>  
> @@ -390,7 +391,8 @@ struct l2tp_tunnel *l2tp_tunnel_find(const struct net 
> *net, u32 tunnel_id)
>  
>   rcu_read_lock_bh();
>   list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) {
> - if (tunnel->tunnel_id == tunnel_id) {
> + if (tunnel->tunnel_id == tunnel_id &&
> + !test_bit(0, >dead)) {
>   rcu_read_unlock_bh();
>   return tunnel;
>   }
> @@ -409,7 +411,7 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net 
> *net, int nth)
>  
>   rcu_read_lock_bh();
>   list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) {
> - if (++count > nth) {
> + if (++count > nth && !test_bit(0, >dead)) {
>   rcu_read_unlock_bh();
>   return tunnel;
>   }
> 
I don't get why you're checking the dead flag in l2tp_tunnel_{get,find}*().
Since it can be set concurrently right after test_bit(), it doesn't
protect the caller from getting a tunnel that is being removed by
l2tp_tunnel_delete().
Or have I missed something?


Re: [RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"

2017-09-19 Thread Guenter Roeck
On Tue, Sep 19, 2017 at 9:15 AM, Douglas Anderson  wrote:
> Every once in a while when my system is under a bit of stress I see
> some spammy messages show up in my logs that say:
>
>   kevent X may have been dropped
>
> As far as I can tell these messages aren't terribly useful.  The
> comments around the messages make me think that either workqueues used
> to work differently or that the original author of the code missed a
> sublety related to them.  The error message appears to predate the git
> conversion of the kernel so it's somewhat hard to tell.
>
> Specifically, workqueues should work like this:
>
> A) If a workqueue hasn't been scheduled then schedule_work() schedules
>it and returns true.
>
> B) If a workqueue has been scheduled (but hasn't started) then
>schedule_work() will do nothing and return false.
>
> C) If a workqueue has been scheduled (and has started) then
>schedule_work() will put it on the queue to run again and return
>true.
>
> Said another way: if you call schedule_work() you can guarantee that
> at least one full runthrough of the work will happen again.  That
> should mean that the work will get processed and I don't see any
> reason to think something should be dropped.
>
> Reading the comments in in usbnet_defer_kevent() made me think that B)
> and C) would be treated the same.  That is: even if we've started the
> work and are 99% of the way through then schedule_work() would return
> false and the work wouldn't be queued again.  If schedule_work()
> really did behave that way then, truly, some amount of work would be
> lost.  ...but it doesn't.
>
> NOTE: if somehow these warnings are useful to mean something then
> perhaps we should change them to make it more obvious.  If it's
> interesting to know when the work is backlogged then we should change
> the spam to say "warning: usbnet is backlogged".
>
> ALSO NOTE: If somehow some of the types of work need to be repeated if
> usbnet_defer_kevent() is called multiple times then that should be
> quite easy to accomplish without dropping any work on the floor.  We
> can just keep an atomic count for that type of work and add a loop
> into usbnet_deferred_kevent().
>
> Signed-off-by: Douglas Anderson 

Reviewed-by: Guenter Roeck 

> ---
>
>  drivers/net/usb/usbnet.c | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
> index 6510e5cc1817..a3e8dbaadcf9 100644
> --- a/drivers/net/usb/usbnet.c
> +++ b/drivers/net/usb/usbnet.c
> @@ -450,19 +450,17 @@ static enum skb_state defer_bh(struct usbnet *dev, 
> struct sk_buff *skb,
>  }
>
>  /* some work can't be done in tasklets, so we use keventd
> - *
> - * NOTE:  annoying asymmetry:  if it's active, schedule_work() fails,
> - * but tasklet_schedule() doesn't.  hope the failure is rare.
>   */
>  void usbnet_defer_kevent (struct usbnet *dev, int work)
>  {
> set_bit (work, >flags);
> -   if (!schedule_work (>kevent)) {
> -   if (net_ratelimit())
> -   netdev_err(dev->net, "kevent %d may have been 
> dropped\n", work);
> -   } else {
> -   netdev_dbg(dev->net, "kevent %d scheduled\n", work);
> -   }
> +
> +   /* If work is already started this will mark it to run again when it
> +* finishes; if we already had work pending and it hadn't started
> +* yet then that's fine too.
> +*/
> +   schedule_work (>kevent);
> +   netdev_dbg(dev->net, "kevent %d scheduled\n", work);
>  }
>  EXPORT_SYMBOL_GPL(usbnet_defer_kevent);
>
> --
> 2.14.1.690.gbb1197296e-goog
>


Re: [PATCH net-next v2 08/12] net: dsa: b53: Move EEE functions to b53

2017-09-19 Thread Vivien Didelot
Florian Fainelli  writes:

> Move the bcm_sf2 EEE-related functions to the b53 driver because this is 
> shared
> code amongst Gigabit capable switch, only 5325 and 5365 are too old to support
> that.
>
> Signed-off-by: Florian Fainelli 

Reviewed-by: Vivien Didelot 


Re: [PATCH net-next v2 09/12] net: dsa: b53: Wire-up EEE

2017-09-19 Thread Vivien Didelot
Florian Fainelli  writes:

> Add support for enabling and disabling EEE, as well as re-negotiating it in
> .adjust_link() and in .port_enable().
>
> Signed-off-by: Florian Fainelli 

Reviewed-by: Vivien Didelot 


Re: [PATCH net-next v2 10/12] net: dsa: b53: Export b53_imp_vlan_setup()

2017-09-19 Thread Vivien Didelot
Florian Fainelli  writes:

> bcm_sf2 and b53 do exactly the same thing, so share that piece.
>
> Signed-off-by: Florian Fainelli 

Reviewed-by: Vivien Didelot 


Re: software interrupts close to 100 with 9000 tc filter entries

2017-09-19 Thread Marco Berizzi
> Eric Dumazet wrote:
> 
> On Tue, 2017-09-19 at 15:28 +0200, Marco Berizzi wrote:
> 
> > Hi Folks,
> > 
> > I'm running linux 4.12.10 x86_64 on a Slackware 14.2 64bit
> > as a simple 4 NIC router. Network throughput processed by
> > this machine is less than 200Mbit/s
> > The cpu model is Intel(R) Xeon(R) CPU 5160 @ 3.00GHz with
> > 2GB ram.
> > 
> > I need to blacklist about 9000 single ip addresses.
> > This is the relevant script to blacklist these ip addresses:
> > 
> > tc qdisc add dev eth0 ingress
> > tc qdisc add dev eth1 ingress
> > 
> > while read -r line
> > do
> >  tc filter add dev eth0 parent : protocol ip prio 50 u32 match ip src 
> > $line action drop
> >  tc filter add dev eth1 parent : protocol ip prio 50 u32 match ip src 
> > $line action drop
> > done < blacklisted_ip_addresses
> > 
> > After loading these ip addresses, the si (software interrupts)
> > number shown by top is always close to 100
> > If I delete the ingress qdisc on both the device, the si
> > fall down to less than 5
> > 
> > Running the same script with 'only' 700 ip addresses is
> > flawless.
> > 
> > Kindly I would like to ask if am I doing anything in
> > a wrong way or if the hardware is too old for this kind
> > of setup.
> > 
> > I have selected the tc filter setup instead of netfilter
> > one, because I was reading this from iproute2/doc/actions:
> > 
> > A side effect is that we can now get stateless firewalling to work with tc..
> > Essentially this is now an alternative to iptables.
> > I wont go into details of my dislike for iptables at times, but.
> > scalability is one of the main issues; however, if you need stateful
> > classification - use netfilter (for now).
> > 
> > Any response are welcome
> > TIA
> 
> Processing a list of 700 rules per incoming packet is not wise.
> 
> Alternatives :
> 
> *   netfilter with IPSET : This probably can be done with one lookup in a
> table. Probably easiest way to setup.
> 
> *   BPF filter (XDP or TC )

Thanks Eric for the quick response.
For better performance (latency time and network throughput) which is the better
solution? netfilter with ipset or BPF?


Re: [PATCH net-next v2 11/12] net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP

2017-09-19 Thread Vivien Didelot
Florian Fainelli  writes:

> The magic number 8 in 3 locations in bcm_sf2_cfp.c actually designates the
> number of switch port egress queues, so use that define instead of open-coding
> it.
>
> Signed-off-by: Florian Fainelli 

Reviewed-by: Vivien Didelot 


[RFC 1/1] net/smc: add SMC rendezvous protocol

2017-09-19 Thread Ursula Braun
The SMC protocol [1] uses a rendezvous protocol to negotiate SMC
capability between peers. The current Linux implementation does not use
this rendezvous protocol and, thus, is not compliant to RFC7609 and
incompatible with other SMC implementations like in zOS. This patch adds
support for the SMC rendezvous protocol.

Details:

The SMC rendezvous protocol relies on the use of a new TCP experimental
option. With this option, SMC capabilities are exchanged between the
peers during the TCP three way handshake.

The goal of this patch is to leave common TCP code unmodified. Thus,
it uses netfilter hooks to intercept TCP SYN and SYN/ACK packets. For
outgoing packets originating from SMC sockets, the experimental option
is added. For inbound packets destined for SMC sockets, the experimental
option is checked.

Another goal was to minimize the performance impact on non-SMC traffic
(when SMC is enabled). The netfilter hooks used for SMC client
connections are active only during TCP connection establishment.
The netfilter hooks used for SMC servers are active as long as there are
listening SMC sockets.

When the hooks are active, the following additional operations are
performed on incoming and outgoing packets:
  (1) call SMC netfilter hook (all IPv4 packets)
  (2) check if TCP SYN or SYN/ACK packet (all IPv4 packets)
  (3) check if packet goes to/comes from SMC socket (SYN & SYN/ACK
  packets only)
  (4) check/add SMC experimental option (SMC sockets' SYN & SYN/ACK
  packets only)

References:
  [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609

Signed-off-by: Hans Wippel 
Signed-off-by: Ursula Braun 
---
 net/smc/Makefile |   2 +-
 net/smc/af_smc.c |  66 ++-
 net/smc/smc.h|  10 +-
 net/smc/smc_rv.c | 542 +++
 net/smc/smc_rv.h |  31 
 5 files changed, 644 insertions(+), 7 deletions(-)
 create mode 100644 net/smc/smc_rv.c
 create mode 100644 net/smc/smc_rv.h

diff --git a/net/smc/Makefile b/net/smc/Makefile
index 188104654b54..2155a7eff41d 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_SMC)  += smc.o
 obj-$(CONFIG_SMC_DIAG) += smc_diag.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_rv.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 8c6d24b2995d..6c280bbcd2fe 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -34,6 +34,7 @@
 #include 
 
 #include "smc.h"
+#include "smc_rv.h"
 #include "smc_clc.h"
 #include "smc_llc.h"
 #include "smc_cdc.h"
@@ -109,6 +110,7 @@ static int smc_release(struct socket *sock)
 {
struct sock *sk = sock->sk;
struct smc_sock *smc;
+   int old_state;
int rc = 0;
 
if (!sk)
@@ -123,6 +125,7 @@ static int smc_release(struct socket *sock)
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
else
lock_sock(sk);
+   old_state = sk->sk_state;
 
if (smc->use_fallback) {
sk->sk_state = SMC_CLOSED;
@@ -132,6 +135,10 @@ static int smc_release(struct socket *sock)
sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
}
+   if (old_state == SMC_LISTEN) {
+   smc_rv_nf_unregister_hook(sock_net(sk), _nfho_serv);
+   kfree(smc->listen_pends);
+   }
if (smc->clcsock) {
sock_release(smc->clcsock);
smc->clcsock = NULL;
@@ -178,6 +185,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct 
socket *sock)
sk->sk_destruct = smc_destruct;
sk->sk_protocol = SMCPROTO_SMC;
smc = smc_sk(sk);
+   smc->use_fallback = true; /* default: not SMC-capable */
INIT_WORK(>tcp_listen_work, smc_tcp_listen_work);
INIT_LIST_HEAD(>accept_q);
spin_lock_init(>accept_q_lock);
@@ -386,6 +394,10 @@ static int smc_connect_rdma(struct smc_sock *smc)
int rc = 0;
u8 ibport;
 
+   if (smc->use_fallback)
+   /* peer has not signalled SMC-capability */
+   goto out_connected;
+
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc)) {
reason_code = SMC_CLC_DECL_IPSEC;
@@ -496,7 +508,6 @@ static int smc_connect_rdma(struct smc_sock *smc)
smc_tx_init(smc);
 
 out_connected:
-   smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
 
@@ -551,7 +562,11 @@ static int smc_connect(struct socket *sock, struct 
sockaddr *addr,
}
 
smc_copy_sock_settings_to_clc(smc);
+   smc_rv_nf_register_hook(sock_net(sk), _nfho_clnt);
+
rc = kernel_connect(smc->clcsock, addr, alen, flags);
+   if (rc != -EINPROGRESS)
+   

Re: [PATCH net-next 1/3] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE

2017-09-19 Thread Craig Gallek
On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitov  wrote:
Thanks for the review!  Please correct me if I'm wrong...

> On 9/18/17 12:30 PM, Craig Gallek wrote:
>>
>> From: Craig Gallek 
>>
>> This is a simple non-recursive delete operation.  It prunes paths
>> of empty nodes in the tree, but it does not try to further compress
>> the tree as nodes are removed.
>>
>> Signed-off-by: Craig Gallek 
>> ---
>>  kernel/bpf/lpm_trie.c | 80
>> +--
>>  1 file changed, 77 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 1b767844a76f..9d58a576b2ae 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -389,10 +389,84 @@ static int trie_update_elem(struct bpf_map *map,
>> return ret;
>>  }
>>
>> -static int trie_delete_elem(struct bpf_map *map, void *key)
>> +/* Called from syscall or from eBPF program */
>> +static int trie_delete_elem(struct bpf_map *map, void *_key)
>>  {
>> -   /* TODO */
>> -   return -ENOSYS;
>> +   struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
>> +   struct bpf_lpm_trie_key *key = _key;
>> +   struct lpm_trie_node __rcu **trim;
>> +   struct lpm_trie_node *node;
>> +   unsigned long irq_flags;
>> +   unsigned int next_bit;
>> +   size_t matchlen = 0;
>> +   int ret = 0;
>> +
>> +   if (key->prefixlen > trie->max_prefixlen)
>> +   return -EINVAL;
>> +
>> +   raw_spin_lock_irqsave(>lock, irq_flags);
>> +
>> +   /* Walk the tree looking for an exact key/length match and keeping
>> +* track of where we could begin trimming the tree.  The
>> trim-point
>> +* is the sub-tree along the walk consisting of only single-child
>> +* intermediate nodes and ending at a leaf node that we want to
>> +* remove.
>> +*/
>> +   trim = >root;
>> +   node = rcu_dereference_protected(
>> +   trie->root, lockdep_is_held(>lock));
>> +   while (node) {
>> +   matchlen = longest_prefix_match(trie, node, key);
>> +
>> +   if (node->prefixlen != matchlen ||
>> +   node->prefixlen == key->prefixlen)
>> +   break;
>
>
> curious why there is no need to do
> 'node->prefixlen == trie->max_prefixlen' in the above
> like update/lookup do?
I don't believe the node->prefixlen == trie->max_prefixlen check in
trie_update_elem is necessary. In order to get to this third clause,
it implies that the first two clauses evaluated false.  Which happens
when we find an exact prefix match for the current node, but the
to-be-inserted key prefix is different.  If the node we are comparing
against had a prefix of max_prefixlen, it would not be possible to
have both a full prefix match but different prefix lengths.  This
assumes that there are no nodes in the tree with > max_prefixlen
prefixes, but that is handled earlier in the update function.

There's a similar (I believe) unnecessary max_prefixlen check in
trie_lookup_elem.  The function should behave the same way without
that check, but at least in this case it's used as an early-out and
saves a few lines of execution.

>
>> +
>> +   next_bit = extract_bit(key->data, node->prefixlen);
>> +   /* If we hit a node that has more than one child or is a
>> valid
>> +* prefix itself, do not remove it. Reset the root of the
>> trim
>> +* path to its descendant on our path.
>> +*/
>> +   if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
>> +   (node->child[0] && node->child[1]))
>> +   trim = >child[next_bit];
>> +   node = rcu_dereference_protected(
>> +   node->child[next_bit],
>> lockdep_is_held(>lock));
>> +   }
>> +
>> +   if (!node || node->prefixlen != key->prefixlen ||
>> +   (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>> +   ret = -ENOENT;
>> +   goto out;
>> +   }
>> +
>> +   trie->n_entries--;
>> +
>> +   /* If the node we are removing is not a leaf node, simply mark it
>> +* as intermediate and we are done.
>> +*/
>> +   if (rcu_access_pointer(node->child[0]) ||
>> +   rcu_access_pointer(node->child[1])) {
>> +   node->flags |= LPM_TREE_NODE_FLAG_IM;
>> +   goto out;
>> +   }
>> +
>> +   /* trim should now point to the slot holding the start of a path
>> from
>> +* zero or more intermediate nodes to our leaf node for deletion.
>> +*/
>> +   while ((node = rcu_dereference_protected(
>> +   *trim, lockdep_is_held(>lock {
>> +   RCU_INIT_POINTER(*trim, NULL);
>> +   trim = rcu_access_pointer(node->child[0]) ?
>> +   >child[0] :
>> +   >child[1];
>> +   

Re: [PATCH RFC V1 net-next 0/6] Time based packet transmission

2017-09-19 Thread Miroslav Lichvar
On Mon, Sep 18, 2017 at 09:41:15AM +0200, Richard Cochran wrote:
> This series is an early RFC that introduces a new socket option
> allowing time based transmission of packets.  This option will be
> useful in implementing various real time protocols over Ethernet,
> including but not limited to P802.1Qbv, which is currently finding
> its way into 802.1Q.

If I understand it correctly, this also allows us to make a PTP/NTP
"one-step" clock with HW that doesn't support it directly.

> * Open questions about SO_TXTIME semantics
> 
>   - What should the kernel do if the dialed Tx time is in the past?
> Should the packet be sent ASAP, or should we throw an error?

Dropping the packet with an error would make more sense to me.

>   - What should the timescale be for the dialed Tx time?  Should the
> kernel select UTC when using the SW Qdisc and the HW time
> otherwise?  Or should the socket option include a clockid_t?

I think for applications that don't (want to) bind their socket to a
specific interface it would be useful if the cmsg specified clockid_t
or maybe if_index. If the packet would be sent using a different
PHC/interface, it should be dropped.

>   | | plain preempt_rt | so_txtime | txtime @ 250 us |
>   |-+--+---+-|
>   | min:|+1.940800e+04 | +4.72e+02 |   +4.72e+02 |
>   | max:|+7.556000e+04 | +5.68e+02 |   +5.76e+02 |
>   | pk-pk:  |+5.615200e+04 | +9.60e+01 |   +1.04e+02 |
>   | mean:   |+3.292776e+04 | +5.072274e+02 |   +5.073602e+02 |
>   | stddev: |+6.514709e+03 | +1.310849e+01 |   +1.507144e+01 |
>   | count:  |   60 |60 | 240 |
> 
>   Using so_txtime, the peak to peak jitter is about 100 nanoseconds,

Nice!

-- 
Miroslav Lichvar


Re: [PATCH net-next 07/14] gtp: Support encapsulation of IPv6 packets

2017-09-19 Thread David Miller
From: Harald Welte 
Date: Tue, 19 Sep 2017 20:12:45 +0800

> Hi Dave,
> 
> On Mon, Sep 18, 2017 at 09:19:08PM -0700, David Miller wrote:
> 
>> > +static inline u32 ipv6_hashfn(const struct in6_addr *a)
>> > +{
>> > +  return __ipv6_addr_jhash(a, gtp_h_initval);
>> > +}
>> 
>> I know you are just following the pattern of the existing "ipv4_hashfn()" 
>> here
>> but this kind of stuff is not very global namespace friendly.  Even simply
>> adding a "gtp_" prefix to these hash functions would be a lot better.
> 
> I would agree if this was an inline function defined in a header file or
> a non-static function.  But where is the global namespace concern in
> case of static inline functions defined and used in the same .c file?

The problem is if we create a generic ipv6_hashfn() in linux/ipv6.h or
something like that, then this driver stops building.


Re: [PATCH net-next 2/4] qed: Add iWARP out of order support

2017-09-19 Thread Leon Romanovsky
On Tue, Sep 19, 2017 at 08:26:17PM +0300, Michal Kalderon wrote:
> iWARP requires OOO support which is already provided by the ll2
> interface (until now was used only for iSCSI offload).
> The changes mostly include opening a ll2 dedicated connection for
> OOO and notifying the FW about the handle id.
>
> Signed-off-by: Michal Kalderon 
> Signed-off-by: Ariel Elior 
> ---
>  drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 44 
> +
>  drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 +++-
>  drivers/net/ethernet/qlogic/qed/qed_rdma.c  |  7 +++--
>  3 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c 
> b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
> index 9d989c9..568e985 100644
> --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
> +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
> @@ -41,6 +41,7 @@
>  #include "qed_rdma.h"
>  #include "qed_reg_addr.h"
>  #include "qed_sp.h"
> +#include "qed_ooo.h"
>
>  #define QED_IWARP_ORD_DEFAULT32
>  #define QED_IWARP_IRD_DEFAULT32
> @@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn 
> *p_hwfn, u32 cid)
>   spin_unlock_bh(_hwfn->p_rdma_info->lock);
>  }
>
> +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn,
> +   struct iwarp_init_func_params *p_ramrod)
> +{
> + p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) +
> + p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle;
> +}
> +
>  static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid)
>  {
>   int rc;
> @@ -1876,6 +1884,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
> struct qed_ptt *p_ptt)
>   iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
>   }
>
> + if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) {
> + rc = qed_ll2_terminate_connection(p_hwfn,
> +   iwarp_info->ll2_ooo_handle);
> + if (rc)
> + DP_INFO(p_hwfn, "Failed to terminate ooo connection\n");

What exactly will you do with this knowledge? In any case, you are not
interested in the return value of the qed_ll2_terminate_connection()
function, either in this place or in the other places.

Why don't you handle EAGAIN returned from the qed_ll2_terminate_connection()?

Thanks

> +
> + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
> + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
> + }
> +
>   qed_llh_remove_mac_filter(p_hwfn,
> p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
>   return rc;
> @@ -1927,10 +1945,12 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn 
> *p_hwfn, struct qed_ptt *p_ptt)
>   struct qed_iwarp_info *iwarp_info;
>   struct qed_ll2_acquire_data data;
>   struct qed_ll2_cbs cbs;
> + u16 n_ooo_bufs;
>   int rc = 0;
>
>   iwarp_info = _hwfn->p_rdma_info->iwarp;
>   iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
> + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
>
>   iwarp_info->max_mtu = params->max_mtu;
>
> @@ -1978,6 +1998,29 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
> struct qed_ptt *p_ptt)
>   if (rc)
>   goto err;
>
> + /* Start OOO connection */
> + data.input.conn_type = QED_LL2_TYPE_OOO;
> + data.input.mtu = params->max_mtu;
> +
> + n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) /
> +  iwarp_info->max_mtu;
> + n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE);
> +
> + data.input.rx_num_desc = n_ooo_bufs;
> + data.input.rx_num_ooo_buffers = n_ooo_bufs;
> +
> + data.input.tx_max_bds_per_packet = 1;   /* will never be fragmented */
> + data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE;
> + data.p_connection_handle = _info->ll2_ooo_handle;
> +
> + rc = qed_ll2_acquire_connection(p_hwfn, );
> + if (rc)
> + goto err;
> +
> + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
> + if (rc)
> + goto err;
> +
>   return rc;
>  err:
>   qed_iwarp_ll2_stop(p_hwfn, p_ptt);
> @@ -2014,6 +2057,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct 
> qed_ptt *p_ptt,
>
>   qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP,
> qed_iwarp_async_event);
> + qed_ooo_setup(p_hwfn);
>
>   return qed_iwarp_ll2_start(p_hwfn, params, p_ptt);
>  }
> diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h 
> b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
> index 148ef3c..9e2bfde 100644
> --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
> +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
> @@ -47,7 +47,12 @@ enum qed_iwarp_qp_state {
>  #define QED_IWARP_LL2_SYN_TX_SIZE   (128)
>  #define 

Re: [PATCH net-next 03/14] gtp: Call common functions to get tunnel routes and add dst_cache

2017-09-19 Thread David Miller
From: Harald Welte 
Date: Tue, 19 Sep 2017 20:09:42 +0800

> So I guess you're asking us to document that rationale as form of a
> source code comment ?

Yes that would make ignoring the potential changing of the non-const
'saddr' argument at least be documented.



Re: [PATCH net-next 3/4] qed: Fix maximum number of CQs for iWARP

2017-09-19 Thread Leon Romanovsky
On Tue, Sep 19, 2017 at 08:26:18PM +0300, Michal Kalderon wrote:
> The maximum number of CQs supported is bound to the number
> of connections supported, which differs between RoCE and iWARP.
>
> This fixes a crash that occurred in iWARP when running 1000 sessions
> using perftest.
>
> Signed-off-by: Michal Kalderon 
> Signed-off-by: Ariel Elior 
> ---

It is worth adding a Fixes line.

Thanks


signature.asc
Description: PGP signature


[PATCH net] net: change skb->mac_header when Generic XDP calls adjust_head

2017-09-19 Thread Edward Cree
Since XDP's view of the packet includes the MAC header, moving the start-
 of-packet with bpf_xdp_adjust_head needs to also update the offset of the
 MAC header (which is relative to skb->head, not to the skb->data that was
 changed).
Without this, tcpdump sees packets starting from the old MAC header rather
 than the new one, at least in my tests on the loopback device.

Fixes: b5cdae3291f7 ("net: Generic XDP")
Signed-off-by: Edward Cree 
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index fb766d9..9a2254f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
__skb_pull(skb, off);
else if (off < 0)
__skb_push(skb, -off);
+   skb->mac_header += off;
 
switch (act) {
case XDP_REDIRECT:


[PATCH net-next v3 02/12] net: dsa: b53: Make b53_enable_cpu_port() take a port argument

2017-09-19 Thread Florian Fainelli
In preparation for future changes allowing the configuring of multiple
CPU ports, make b53_enable_cpu_port() take a port argument.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 274f3679f33d..d8bc54cfcfbe 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -538,19 +538,18 @@ static void b53_disable_port(struct dsa_switch *ds, int 
port,
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
 }
 
-static void b53_enable_cpu_port(struct b53_device *dev)
+static void b53_enable_cpu_port(struct b53_device *dev, int port)
 {
-   unsigned int cpu_port = dev->cpu_port;
u8 port_ctrl;
 
/* BCM5325 CPU port is at 8 */
-   if ((is5325(dev) || is5365(dev)) && cpu_port == B53_CPU_PORT_25)
-   cpu_port = B53_CPU_PORT;
+   if ((is5325(dev) || is5365(dev)) && port == B53_CPU_PORT_25)
+   port = B53_CPU_PORT;
 
port_ctrl = PORT_CTRL_RX_BCST_EN |
PORT_CTRL_RX_MCST_EN |
PORT_CTRL_RX_UCST_EN;
-   b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(cpu_port), port_ctrl);
+   b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl);
 }
 
 static void b53_enable_mib(struct b53_device *dev)
@@ -820,7 +819,7 @@ static int b53_setup(struct dsa_switch *ds)
if (BIT(port) & ds->enabled_port_mask)
b53_enable_port(ds, port, NULL);
else if (dsa_is_cpu_port(ds, port))
-   b53_enable_cpu_port(dev);
+   b53_enable_cpu_port(dev, port);
else
b53_disable_port(ds, port, NULL);
}
-- 
2.9.3



Re: [PATCH net-next 2/4] net: dsa: setup master ethtool unconditionally

2017-09-19 Thread Florian Fainelli
On 09/19/2017 08:56 AM, Vivien Didelot wrote:
> When a DSA switch tree is meant to be applied, it already has a CPU
> port. Thus remove the condition of dst->cpu_dp.
> 
> Moreover, the next lines access dst->cpu_dp unconditionally.
> 
> Signed-off-by: Vivien Didelot 

Reviewed-by: Florian Fainelli 
-- 
Florian


[PATCH net-next v3 05/12] net: dsa: b53: Use a macro to define I/O operations

2017-09-19 Thread Florian Fainelli
Instead of repeating the same pattern: acquire mutex, read/write,
release mutex, define a macro: b53_build_op() which takes the type
(read|write), I/O size, and value (scalar or pointer). This helps with
fixing bugs that could exist (e.g: missing barrier, lock etc.).

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_priv.h | 133 +++--
 1 file changed, 22 insertions(+), 111 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 7528b22aeb03..5bebe97900e8 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -199,119 +199,30 @@ static inline void b53_switch_remove(struct b53_device 
*dev)
dsa_unregister_switch(dev->ds);
 }
 
-static inline int b53_read8(struct b53_device *dev, u8 page, u8 reg, u8 *val)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->read8(dev, page, reg, val);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_read16(struct b53_device *dev, u8 page, u8 reg, u16 *val)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->read16(dev, page, reg, val);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_read32(struct b53_device *dev, u8 page, u8 reg, u32 *val)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->read32(dev, page, reg, val);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_read48(struct b53_device *dev, u8 page, u8 reg, u64 *val)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->read48(dev, page, reg, val);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
+#define b53_build_op(type_op_size, val_type)   \
+static inline int b53_##type_op_size(struct b53_device *dev, u8 page,  \
+u8 reg, val_type val)  \
+{  \
+   int ret;\
+   \
+   mutex_lock(>reg_mutex);\
+   ret = dev->ops->type_op_size(dev, page, reg, val);  \
+   mutex_unlock(>reg_mutex);  \
+   \
+   return ret; \
 }
 
-static inline int b53_read64(struct b53_device *dev, u8 page, u8 reg, u64 *val)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->read64(dev, page, reg, val);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_write8(struct b53_device *dev, u8 page, u8 reg, u8 value)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->write8(dev, page, reg, value);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_write16(struct b53_device *dev, u8 page, u8 reg,
- u16 value)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->write16(dev, page, reg, value);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_write32(struct b53_device *dev, u8 page, u8 reg,
- u32 value)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->write32(dev, page, reg, value);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_write48(struct b53_device *dev, u8 page, u8 reg,
- u64 value)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->write48(dev, page, reg, value);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
-
-static inline int b53_write64(struct b53_device *dev, u8 page, u8 reg,
-  u64 value)
-{
-   int ret;
-
-   mutex_lock(>reg_mutex);
-   ret = dev->ops->write64(dev, page, reg, value);
-   mutex_unlock(>reg_mutex);
-
-   return ret;
-}
+b53_build_op(read8, u8 *);
+b53_build_op(read16, u16 *);
+b53_build_op(read32, u32 *);
+b53_build_op(read48, u64 *);
+b53_build_op(read64, u64 *);
+
+b53_build_op(write8, u8);
+b53_build_op(write16, u16);
+b53_build_op(write32, u32);
+b53_build_op(write48, u64);
+b53_build_op(write64, u64);
 
 struct b53_arl_entry {
u8 port;
-- 
2.9.3



[PATCH net-next v3 01/12] net: dsa: b53: Remove is_cpu_port()

2017-09-19 Thread Florian Fainelli
This is not used anywhere, so remove it.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_priv.h | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 01bd8cbe9a3f..7528b22aeb03 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -186,11 +186,6 @@ static inline int is58xx(struct b53_device *dev)
 #define B53_CPU_PORT_255
 #define B53_CPU_PORT   8
 
-static inline int is_cpu_port(struct b53_device *dev, int port)
-{
-   return dev->cpu_port;
-}
-
 struct b53_device *b53_switch_alloc(struct device *base,
const struct b53_io_ops *ops,
void *priv);
-- 
2.9.3



Re: pull-request: mac80211 2017-11-19

2017-09-19 Thread David Miller
From: Johannes Berg 
Date: Tue, 19 Sep 2017 09:20:47 +0200

> Here's a new set of two small changes to prevent null pointer
> dereferences on malformed netlink messages.
> 
> Please pull and let me know if there's any problem.

Pulled, thank you.


Re: [PATCH net-next 05/14] gtp: Remove special mtu handling

2017-09-19 Thread Tom Herbert
On Tue, Sep 19, 2017 at 4:42 AM, Harald Welte  wrote:
> Hi Tom,
>
> On Mon, Sep 18, 2017 at 05:38:55PM -0700, Tom Herbert wrote:
>> Removes MTU handling in gtp_build_skb_ip4. This is non standard relative
>> to how other tunneling protocols handle MTU. The model espoused is that
>> the inner interface should set it's MTU to be less than the expected
>> path MTU on the overlay network. Path MTU discovery is not typically
>> used for modifying tunnel MTUs.
>
> The point of the kernel GTP module is to interoperate with existing
> other GTP implementations and the practises established by cellular
> operators when operating GTP in their networks.
>
> While what you describe (chose interface MTU to be less than the
> expected path MTU) is generally best practise in the Linux IP/networking
> world, this is not generally reflected in the cellular
> universe. You see quite a bit of GTP fragmentation due to the fact
> that the transport network simply has to deal with the MTU that has
> been established via the control plane between SGSN and MS/UE, without
> the GGSN even being part of that negotiation.
>
> Also, you may very well have one "gtp0" tunnel device at the GGSN,
> but you are establishing individual GTP tunnels to dozens to hundreds of
> different SGSNs at operators all over the world.  You cannot reliably
> set the "gtp0" interface MTU to "the path MTU of the overlay network",
> as the overlay network is in fact different for each of the SGSNs you're
> talking to - and each may have a different path MTU.
>
> So unless I'm missing something, I would currently vote for staying with
> the current code, which uses the path MTU to the specific destination IP
> address (the SGSN).
>
Okay, I'll modify tnl_update_pmtu so we can call it from GTP and not
have to replicate that function. I suspect VXLAN might also want this
at some point.

Tom

> Regards,
> Harald
>
> --
> - Harald Welte    http://laforge.gnumonks.org/
> 
> "Privacy in residential applications is a desirable marketing option."
>   (ETSI EN 300 175-7 Ch. A6)


Re: [RFC PATCH 3/3] usbnet: Fix memory leak when rx_submit() fails

2017-09-19 Thread Bjørn Mork
Douglas Anderson  writes:

> If rx_submit() returns an error code then nobody calls usb_free_urb().
> That means it's leaked.

Nope.  rx_submit() will call usb_free_urb() before returning an error:


static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
..
if (!skb) {
netif_dbg(dev, rx_err, dev->net, "no rx skb\n");
usbnet_defer_kevent (dev, EVENT_RX_MEMORY);
usb_free_urb (urb);
return -ENOMEM;
}
..
if (retval) {
dev_kfree_skb_any (skb);
usb_free_urb (urb);
}





Bjørn


[PATCH net-next v3 07/12] net: dsa: b53: Define EEE register page

2017-09-19 Thread Florian Fainelli
In preparation for migrating the EEE code from bcm_sf2 to b53, define the full
EEE register page and offsets within that page.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_regs.h | 41 +
 1 file changed, 41 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h
index 5e8b8e31fee8..2a9f421680aa 100644
--- a/drivers/net/dsa/b53/b53_regs.h
+++ b/drivers/net/dsa/b53/b53_regs.h
@@ -50,6 +50,9 @@
 /* Jumbo Frame Registers */
 #define B53_JUMBO_PAGE 0x40
 
+/* EEE Control Registers Page */
+#define B53_EEE_PAGE   0x92
+
 /* CFP Configuration Registers Page */
 #define B53_CFP_PAGE   0xa1
 
@@ -472,6 +475,44 @@
 #define   JMS_MAX_SIZE 9724
 
 /*
+ * EEE Configuration Page Registers
+ */
+
+/* EEE Enable control register (16 bit) */
+#define B53_EEE_EN_CTRL0x00
+
+/* EEE LPI assert status register (16 bit) */
+#define B53_EEE_LPI_ASSERT_STS 0x02
+
+/* EEE LPI indicate status register (16 bit) */
+#define B53_EEE_LPI_INDICATE   0x4
+
+/* EEE Receiving idle symbols status register (16 bit) */
+#define B53_EEE_RX_IDLE_SYM_STS0x6
+
+/* EEE Pipeline timer register (32 bit) */
+#define B53_EEE_PIP_TIMER  0xC
+
+/* EEE Sleep timer Gig register (32 bit) */
+#define B53_EEE_SLEEP_TIMER_GIG(i) (0x10 + 4 * (i))
+
+/* EEE Sleep timer FE register (32 bit) */
+#define B53_EEE_SLEEP_TIMER_FE(i)  (0x34 + 4 * (i))
+
+/* EEE Minimum LP timer Gig register (32 bit) */
+#define B53_EEE_MIN_LP_TIMER_GIG(i)(0x58 + 4 * (i))
+
+/* EEE Minimum LP timer FE register (32 bit) */
+#define B53_EEE_MIN_LP_TIMER_FE(i) (0x7c + 4 * (i))
+
+/* EEE Wake timer Gig register (16 bit) */
+#define B53_EEE_WAKE_TIMER_GIG(i)  (0xa0 + 2 * (i))
+
+/* EEE Wake timer FE register (16 bit) */
+#define B53_EEE_WAKE_TIMER_FE(i)   (0xb2 + 2 * (i))
+
+
+/*
  * CFP Configuration Page Registers
  */
 
-- 
2.9.3



[PATCH net-next v3 08/12] net: dsa: b53: Move EEE functions to b53

2017-09-19 Thread Florian Fainelli
Move the bcm_sf2 EEE-related functions to the b53 driver because this is shared
code amongst Gigabit capable switch, only 5325 and 5365 are too old to support
that.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 63 ++
 drivers/net/dsa/b53/b53_priv.h   |  5 +++
 drivers/net/dsa/bcm_sf2.c| 66 
 drivers/net/dsa/bcm_sf2.h|  2 --
 drivers/net/dsa/bcm_sf2_regs.h   |  3 --
 5 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index aa2187c71ea5..491e4ffa8a0e 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1531,6 +1531,69 @@ void b53_mirror_del(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL(b53_mirror_del);
 
+void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable)
+{
+   struct b53_device *dev = ds->priv;
+   u16 reg;
+
+   b53_read16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, );
+   if (enable)
+   reg |= BIT(port);
+   else
+   reg &= ~BIT(port);
+   b53_write16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, reg);
+}
+EXPORT_SYMBOL(b53_eee_enable_set);
+
+
+/* Returns 0 if EEE was not enabled, or 1 otherwise
+ */
+int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy)
+{
+   int ret;
+
+   ret = phy_init_eee(phy, 0);
+   if (ret)
+   return 0;
+
+   b53_eee_enable_set(ds, port, true);
+
+   return 1;
+}
+EXPORT_SYMBOL(b53_eee_init);
+
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+{
+   struct b53_device *dev = ds->priv;
+   struct ethtool_eee *p = >ports[port].eee;
+   u16 reg;
+
+   if (is5325(dev) || is5365(dev))
+   return -EOPNOTSUPP;
+
+   b53_read16(dev, B53_EEE_PAGE, B53_EEE_LPI_INDICATE, );
+   e->eee_enabled = p->eee_enabled;
+   e->eee_active = !!(reg & BIT(port));
+
+   return 0;
+}
+EXPORT_SYMBOL(b53_get_mac_eee);
+
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+{
+   struct b53_device *dev = ds->priv;
+   struct ethtool_eee *p = >ports[port].eee;
+
+   if (is5325(dev) || is5365(dev))
+   return -EOPNOTSUPP;
+
+   p->eee_enabled = e->eee_enabled;
+   b53_eee_enable_set(ds, port, e->eee_enabled);
+
+   return 0;
+}
+EXPORT_SYMBOL(b53_set_mac_eee);
+
 static const struct dsa_switch_ops b53_switch_ops = {
.get_tag_protocol   = b53_get_tag_protocol,
.setup  = b53_setup,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 77102f685da0..aabe80eab25d 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -70,6 +70,7 @@ enum {
 
 struct b53_port {
u16 vlan_ctl_mask;
+   struct ethtool_eee eee;
 };
 
 struct b53_vlan {
@@ -310,5 +311,9 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
 void b53_mirror_del(struct dsa_switch *ds, int port,
struct dsa_mall_mirror_tc_entry *mirror);
 void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
+void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable);
+int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy);
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
 
 #endif
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 49cb51223f70..4e8ef4c07eab 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -107,19 +107,6 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int 
port)
core_writel(priv, reg, offset);
 }
 
-static void bcm_sf2_eee_enable_set(struct dsa_switch *ds, int port, bool 
enable)
-{
-   struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-   u32 reg;
-
-   reg = core_readl(priv, CORE_EEE_EN_CTRL);
-   if (enable)
-   reg |= 1 << port;
-   else
-   reg &= ~(1 << port);
-   core_writel(priv, reg, CORE_EEE_EN_CTRL);
-}
-
 static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
 {
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
@@ -256,8 +243,8 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int 
port,
bcm_sf2_imp_vlan_setup(ds, cpu_port);
 
/* If EEE was enabled, restore it */
-   if (priv->port_sts[port].eee.eee_enabled)
-   bcm_sf2_eee_enable_set(ds, port, true);
+   if (priv->dev->ports[port].eee.eee_enabled)
+   b53_eee_enable_set(ds, port, true);
 
return 0;
 }
@@ -292,47 +279,6 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, 
int port,
core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL);
 }
 
-/* Returns 0 if EEE was 

[PATCH net-next v3 12/12] net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port

2017-09-19 Thread Florian Fainelli
Export b53_{enable,disable}_port and use these two functions in
bcm_sf2_port_setup and bcm_sf2_port_disable. The generic functions
cannot be used without wrapping because we need to manage additional
switch integration details (PHY, Broadcom tag etc.).

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c |  8 
 drivers/net/dsa/b53/b53_priv.h   |  2 ++
 drivers/net/dsa/bcm_sf2.c| 26 ++
 3 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index c3f1cd2c33ea..a9f2a5b55a5e 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -502,8 +502,7 @@ void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
 }
 EXPORT_SYMBOL(b53_imp_vlan_setup);
 
-static int b53_enable_port(struct dsa_switch *ds, int port,
-  struct phy_device *phy)
+int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
struct b53_device *dev = ds->priv;
unsigned int cpu_port = dev->cpu_port;
@@ -530,9 +529,9 @@ static int b53_enable_port(struct dsa_switch *ds, int port,
 
return 0;
 }
+EXPORT_SYMBOL(b53_enable_port);
 
-static void b53_disable_port(struct dsa_switch *ds, int port,
-struct phy_device *phy)
+void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
struct b53_device *dev = ds->priv;
u8 reg;
@@ -542,6 +541,7 @@ static void b53_disable_port(struct dsa_switch *ds, int 
port,
reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE;
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
 }
+EXPORT_SYMBOL(b53_disable_port);
 
 void b53_brcm_hdr_setup(struct dsa_switch *ds, int port)
 {
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 8f4f83e2e4bd..603c66d240d8 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -311,6 +311,8 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror, bool ingress);
 void b53_mirror_del(struct dsa_switch *ds, int port,
struct dsa_mall_mirror_tc_entry *mirror);
+int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy);
+void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy);
 void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
 void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable);
 int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 08639674947a..0072a959db5b 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -163,7 +163,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int 
port,
  struct phy_device *phy)
 {
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-   s8 cpu_port = ds->dst->cpu_dp->index;
unsigned int i;
u32 reg;
 
@@ -184,9 +183,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int 
port,
reg |= i << (PRT_TO_QID_SHIFT * i);
core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port));
 
-   /* Clear the Rx and Tx disable bits and set to no spanning tree */
-   core_writel(priv, 0, CORE_G_PCTL_PORT(port));
-
/* Re-enable the GPHY and re-apply workarounds */
if (priv->int_phy_mask & 1 << port && priv->hw_params.num_gphy == 1) {
bcm_sf2_gphy_enable_set(ds, true);
@@ -209,23 +205,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int 
port,
if (port == priv->moca_port)
bcm_sf2_port_intr_enable(priv, port);
 
-   /* Set this port, and only this one to be in the default VLAN,
-* if member of a bridge, restore its membership prior to
-* bringing down this port.
-*/
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port));
-   reg &= ~PORT_VLAN_CTRL_MASK;
-   reg |= (1 << port);
-   reg |= priv->dev->ports[port].vlan_ctl_mask;
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port));
-
-   b53_imp_vlan_setup(ds, cpu_port);
-
-   /* If EEE was enabled, restore it */
-   if (priv->dev->ports[port].eee.eee_enabled)
-   b53_eee_enable_set(ds, port, true);
-
-   return 0;
+   return b53_enable_port(ds, port, phy);
 }
 
 static void bcm_sf2_port_disable(struct dsa_switch *ds, int port,
@@ -248,9 +228,7 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int 
port,
else
off = CORE_G_PCTL_PORT(port);
 
-   reg = core_readl(priv, off);
-   reg |= RX_DIS | TX_DIS;
-   core_writel(priv, reg, off);
+   b53_disable_port(ds, port, phy);
 
/* Power down the port memory */
reg = 

[PATCH net-next v3 03/12] net: dsa: b53: Defer port enabling to calling port_enable

2017-09-19 Thread Florian Fainelli
There is no need to configure the enabled ports once in b53_setup() and then a
second time around when dsa_switch_ops::port_enable is called, just do it when
port_enable is called which is better in terms of power consumption and
correctness.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index d8bc54cfcfbe..3297af6aab8a 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -815,12 +815,13 @@ static int b53_setup(struct dsa_switch *ds)
if (ret)
dev_err(ds->dev, "failed to apply configuration\n");
 
+   /* Configure IMP/CPU port, disable unused ports. Enabled
+* ports will be configured with .port_enable
+*/
for (port = 0; port < dev->num_ports; port++) {
-   if (BIT(port) & ds->enabled_port_mask)
-   b53_enable_port(ds, port, NULL);
-   else if (dsa_is_cpu_port(ds, port))
+   if (dsa_is_cpu_port(ds, port))
b53_enable_cpu_port(dev, port);
-   else
+   else if (!(BIT(port) & ds->enabled_port_mask))
b53_disable_port(ds, port, NULL);
}
 
-- 
2.9.3



[PATCH net-next v3 09/12] net: dsa: b53: Wire-up EEE

2017-09-19 Thread Florian Fainelli
Add support for enabling and disabling EEE, as well as re-negotiating it in
.adjust_link() and in .port_enable().

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 491e4ffa8a0e..4e37ec27e496 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -523,6 +523,10 @@ static int b53_enable_port(struct dsa_switch *ds, int port,
 
b53_imp_vlan_setup(ds, cpu_port);
 
+   /* If EEE was enabled, restore it */
+   if (dev->ports[port].eee.eee_enabled)
+   b53_eee_enable_set(ds, port, true);
+
return 0;
 }
 
@@ -879,6 +883,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
struct phy_device *phydev)
 {
struct b53_device *dev = ds->priv;
+   struct ethtool_eee *p = &dev->ports[port].eee;
u8 rgmii_ctrl = 0, reg = 0, off;
 
if (!phy_is_pseudo_fixed_link(phydev))
@@ -1000,6 +1005,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int 
port,
b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po);
}
}
+
+   /* Re-negotiate EEE if it was enabled already */
+   p->eee_enabled = b53_eee_init(ds, port, phydev);
 }
 
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering)
@@ -1605,6 +1613,8 @@ static const struct dsa_switch_ops b53_switch_ops = {
.adjust_link= b53_adjust_link,
.port_enable= b53_enable_port,
.port_disable   = b53_disable_port,
+   .get_mac_eee= b53_get_mac_eee,
+   .set_mac_eee= b53_set_mac_eee,
.port_bridge_join   = b53_br_join,
.port_bridge_leave  = b53_br_leave,
.port_stp_state_set = b53_br_set_stp_state,
-- 
2.9.3



[PATCH net-next v3 11/12] net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP

2017-09-19 Thread Florian Fainelli
The magic number 8 in 3 locations in bcm_sf2_cfp.c actually designates
the number of switch port egress queues, so use that define instead of
open-coding it.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/bcm_sf2_cfp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 8a1da7e67707..94649e1481ec 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -144,7 +144,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int 
port,
 * destination port is enabled and that we are within the
 * number of ports supported by the switch
 */
-   port_num = fs->ring_cookie / 8;
+   port_num = fs->ring_cookie / SF2_NUM_EGRESS_QUEUES;
 
if (fs->ring_cookie == RX_CLS_FLOW_DISC ||
!(BIT(port_num) & ds->enabled_port_mask) ||
@@ -280,7 +280,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int 
port,
 * We have a small oddity where Port 6 just does not have a
 * valid bit here (so we subtract by one).
 */
-   queue_num = fs->ring_cookie % 8;
+   queue_num = fs->ring_cookie % SF2_NUM_EGRESS_QUEUES;
if (port_num >= 7)
port_num -= 1;
 
@@ -401,7 +401,7 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, 
int port,
/* There is no Port 6, so we compensate for that here */
if (nfc->fs.ring_cookie >= 6)
nfc->fs.ring_cookie++;
-   nfc->fs.ring_cookie *= 8;
+   nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES;
 
/* Extract the destination queue */
queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK;
-- 
2.9.3



[PATCH net-next v3 10/12] net: dsa: b53: Export b53_imp_vlan_setup()

2017-09-19 Thread Florian Fainelli
bcm_sf2 and b53 do exactly the same thing, so share that piece.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c |  3 ++-
 drivers/net/dsa/b53/b53_priv.h   |  1 +
 drivers/net/dsa/bcm_sf2.c| 23 +--
 3 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 4e37ec27e496..c3f1cd2c33ea 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -484,7 +484,7 @@ static int b53_fast_age_vlan(struct b53_device *dev, u16 
vid)
return b53_flush_arl(dev, FAST_AGE_VLAN);
 }
 
-static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
+void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
 {
struct b53_device *dev = ds->priv;
unsigned int i;
@@ -500,6 +500,7 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int 
cpu_port)
b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(i), pvlan);
}
 }
+EXPORT_SYMBOL(b53_imp_vlan_setup);
 
 static int b53_enable_port(struct dsa_switch *ds, int port,
   struct phy_device *phy)
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index aabe80eab25d..8f4f83e2e4bd 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -284,6 +284,7 @@ static inline int b53_switch_get_reset_gpio(struct 
b53_device *dev)
 #endif
 
 /* Exported functions towards other drivers */
+void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port);
 void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data);
 void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
 int b53_get_sset_count(struct dsa_switch *ds);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 4e8ef4c07eab..08639674947a 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -40,27 +40,6 @@ static enum dsa_tag_protocol 
bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds)
return DSA_TAG_PROTO_BRCM;
 }
 
-static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
-{
-   struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-   unsigned int i;
-   u32 reg;
-
-   /* Enable the IMP Port to be in the same VLAN as the other ports
-* on a per-port basis such that we only have Port i and IMP in
-* the same VLAN.
-*/
-   for (i = 0; i < priv->hw_params.num_ports; i++) {
-   if (!((1 << i) & ds->enabled_port_mask))
-   continue;
-
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
-   reg |= (1 << cpu_port);
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
-   }
-}
-
-
 static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 {
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
@@ -240,7 +219,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int 
port,
reg |= priv->dev->ports[port].vlan_ctl_mask;
core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port));
 
-   bcm_sf2_imp_vlan_setup(ds, cpu_port);
+   b53_imp_vlan_setup(ds, cpu_port);
 
/* If EEE was enabled, restore it */
if (priv->dev->ports[port].eee.eee_enabled)
-- 
2.9.3



[PATCH net-next v3 04/12] net: dsa: bcm_sf2: Defer port enabling to calling port_enable

2017-09-19 Thread Florian Fainelli
There is no need to configure the enabled ports once in bcm_sf2_sw_setup() and
then a second time around when dsa_switch_ops::port_enable is called, just do
it when port_enable is called which is better in terms of power consumption and
correctness.

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/bcm_sf2.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index d7b53d53c116..8acbd17bc1fd 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -890,14 +890,11 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds)
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
unsigned int port;
 
-   /* Enable all valid ports and disable those unused */
+   /* Disable unused ports and configure IMP port */
for (port = 0; port < priv->hw_params.num_ports; port++) {
-   /* IMP port receives special treatment */
-   if ((1 << port) & ds->enabled_port_mask)
-   bcm_sf2_port_setup(ds, port, NULL);
-   else if (dsa_is_cpu_port(ds, port))
+   if (dsa_is_cpu_port(ds, port))
bcm_sf2_imp_setup(ds, port);
-   else
+   else if (!((1 << port) & ds->enabled_port_mask))
bcm_sf2_port_disable(ds, port, NULL);
}
 
-- 
2.9.3



[PATCH net-next v3 06/12] net: dsa: b53: Move Broadcom header setup to b53

2017-09-19 Thread Florian Fainelli
The code to enable Broadcom tags/headers is largely switch independent,
and in preparation for enabling it for multiple devices with b53, move
the code we have in bcm_sf2.c to b53_common.c

Reviewed-by: Vivien Didelot 
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 47 
 drivers/net/dsa/b53/b53_priv.h   |  1 +
 drivers/net/dsa/b53/b53_regs.h   |  7 ++
 drivers/net/dsa/bcm_sf2.c| 43 ++--
 drivers/net/dsa/bcm_sf2_regs.h   |  8 ---
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 3297af6aab8a..aa2187c71ea5 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -538,6 +538,53 @@ static void b53_disable_port(struct dsa_switch *ds, int 
port,
b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
 }
 
+void b53_brcm_hdr_setup(struct dsa_switch *ds, int port)
+{
+   struct b53_device *dev = ds->priv;
+   u8 hdr_ctl, val;
+   u16 reg;
+
+   /* Resolve which bit controls the Broadcom tag */
+   switch (port) {
+   case 8:
+   val = BRCM_HDR_P8_EN;
+   break;
+   case 7:
+   val = BRCM_HDR_P7_EN;
+   break;
+   case 5:
+   val = BRCM_HDR_P5_EN;
+   break;
+   default:
+   val = 0;
+   break;
+   }
+
+   /* Enable Broadcom tags for IMP port */
+   b53_read8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, &hdr_ctl);
+   hdr_ctl |= val;
+   b53_write8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, hdr_ctl);
+
+   /* Registers below are only accessible on newer devices */
+   if (!is58xx(dev))
+   return;
+
+   /* Enable reception Broadcom tag for CPU TX (switch RX) to
+* allow us to tag outgoing frames
+*/
+   b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, &reg);
+   reg &= ~BIT(port);
+   b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, reg);
+
+   /* Enable transmission of Broadcom tags from the switch (CPU RX) to
+* allow delivering frames to the per-port net_devices
+*/
+   b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, &reg);
+   reg &= ~BIT(port);
+   b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, reg);
+}
+EXPORT_SYMBOL(b53_brcm_hdr_setup);
+
 static void b53_enable_cpu_port(struct b53_device *dev, int port)
 {
u8 port_ctrl;
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 5bebe97900e8..77102f685da0 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -309,5 +309,6 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror, bool ingress);
 void b53_mirror_del(struct dsa_switch *ds, int port,
struct dsa_mall_mirror_tc_entry *mirror);
+void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
 
 #endif
diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h
index e5c86d44667a..5e8b8e31fee8 100644
--- a/drivers/net/dsa/b53/b53_regs.h
+++ b/drivers/net/dsa/b53/b53_regs.h
@@ -210,6 +210,7 @@
 #define B53_BRCM_HDR   0x03
 #define   BRCM_HDR_P8_EN   BIT(0) /* Enable tagging on port 8 */
 #define   BRCM_HDR_P5_EN   BIT(1) /* Enable tagging on port 5 */
+#define   BRCM_HDR_P7_EN   BIT(2) /* Enable tagging on port 7 */
 
 /* Mirror capture control register (16 bit) */
 #define B53_MIR_CAP_CTL0x10
@@ -249,6 +250,12 @@
 /* Revision ID register (8 bit) */
 #define B53_REV_ID 0x40
 
+/* Broadcom header RX control (16 bit) */
+#define B53_BRCM_HDR_RX_DIS0x60
+
+/* Broadcom header TX control (16 bit) */
+#define B53_BRCM_HDR_TX_DIS0x62
+
 /*
  * ARL Access Page Registers
  */
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 8acbd17bc1fd..49cb51223f70 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -60,45 +60,6 @@ static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, 
int cpu_port)
}
 }
 
-static void bcm_sf2_brcm_hdr_setup(struct bcm_sf2_priv *priv, int port)
-{
-   u32 reg, val;
-
-   /* Resolve which bit controls the Broadcom tag */
-   switch (port) {
-   case 8:
-   val = BRCM_HDR_EN_P8;
-   break;
-   case 7:
-   val = BRCM_HDR_EN_P7;
-   break;
-   case 5:
-   val = BRCM_HDR_EN_P5;
-   break;
-   default:
-   val = 0;
-   break;
-   }
-
-   /* Enable Broadcom tags for IMP port */
-   reg 

[PATCH net-next v3 00/12] net: dsa: b53/bcm_sf2 cleanups

2017-09-19 Thread Florian Fainelli
Hi all,

This patch series is a first pass set of clean-ups to reduce the number of LOCs
between b53 and bcm_sf2 and sharing as many functions as possible.

There is a number of additional cleanups queued up locally that require more
thorough testing.

Thanks!

Changes in v3:

- remove one extra argument for the b53_build_io_op macro (David Laight)
- added additional Reviewed-by tags from Vivien

Changes in v2:

- added Reviewed-by tags from Vivien
- added a missing EXPORT_SYMBOL() in patch 8
- fixed a typo in patch 5

Florian Fainelli (12):
  net: dsa: b53: Remove is_cpu_port()
  net: dsa: b53: Make b53_enable_cpu_port() take a port argument
  net: dsa: b53: Defer port enabling to calling port_enable
  net: dsa: bcm_sf2: Defer port enabling to calling port_enable
  net: dsa: b53: Use a macro to define I/O operations
  net: dsa: b53: Move Broadcom header setup to b53
  net: dsa: b53: Define EEE register page
  net: dsa: b53: Move EEE functions to b53
  net: dsa: b53: Wire-up EEE
  net: dsa: b53: Export b53_imp_vlan_setup()
  net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP
  net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port

 drivers/net/dsa/b53/b53_common.c | 151 
 drivers/net/dsa/b53/b53_priv.h   | 145 ---
 drivers/net/dsa/b53/b53_regs.h   |  48 
 drivers/net/dsa/bcm_sf2.c| 161 +++
 drivers/net/dsa/bcm_sf2.h|   2 -
 drivers/net/dsa/bcm_sf2_cfp.c|   6 +-
 drivers/net/dsa/bcm_sf2_regs.h   |  11 ---
 7 files changed, 228 insertions(+), 296 deletions(-)

-- 
2.9.3



Re: [RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"

2017-09-19 Thread Bjørn Mork
Douglas Anderson  writes:

> Every once in a while when my system is under a bit of stress I see
> some spammy messages show up in my logs that say:
>
>   kevent X may have been dropped
>
> As far as I can tell these messages aren't terribly useful.

I agree, FWIW. These messages just confuse users for no purpose at all.


> + /* If work is already started this will mark it to run again when it
> +  * finishes; if we already had work pending and it hadn't started
> +  * yet then that's fine too.
> +  */
> + schedule_work (&dev->kevent);
> + netdev_dbg(dev->net, "kevent %d scheduled\n", work);

Or maybe

if (schedule_work (&dev->kevent))
netdev_dbg(dev->net, "kevent %d scheduled\n", work);


?  Not that I think it matters much.


Bjørn


Re: [PATCH net-next 3/4] net: dsa: setup master ethtool after dsa_ptr

2017-09-19 Thread Florian Fainelli
On 09/19/2017 08:56 AM, Vivien Didelot wrote:
> DSA overrides the master's ethtool ops so that we can inject its CPU
> port's statistics. Because of that, we need to setup the ethtool ops
> after the master's dsa_ptr pointer has been assigned, not before.

Yes, good point, technically this is a bugfix, but since we have changed
this quite often and the race is tiny, I am not positive we could a)
trigger this in real life, and b) provide a proper Fixes tag.

> 
> This patch setups the ethtool ops after dsa_ptr is assigned, and
> restores them before it gets cleared.
> 
> Signed-off-by: Vivien Didelot 

Reviewed-by: Florian Fainelli 
-- 
Florian


Re: [PATCH 2/3] selftests: actually run the various net selftests

2017-09-19 Thread Willem de Bruijn
On Tue, Sep 19, 2017 at 9:34 AM, Josef Bacik  wrote:
> On Mon, Sep 18, 2017 at 04:14:41PM -0600, Shuah Khan wrote:
>> On 09/18/2017 11:32 AM, jo...@toxicpanda.com wrote:
>> > From: Josef Bacik 
>> >
>> > These self tests are just self contained binaries, they are not run by
>> > any of the scripts in the directory.  This means they need to be marked
>> > with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES.
>> >
>> > Signed-off-by: Josef Bacik 
>> > ---
>> >  tools/testing/selftests/net/Makefile | 4 ++--
>> >  1 file changed, 2 insertions(+), 2 deletions(-)
>> >
>> > diff --git a/tools/testing/selftests/net/Makefile 
>> > b/tools/testing/selftests/net/Makefile
>> > index 3df542c84610..45a4e77a47c4 100644
>> > --- a/tools/testing/selftests/net/Makefile
>> > +++ b/tools/testing/selftests/net/Makefile
>> > @@ -6,8 +6,8 @@ CFLAGS += -I../../../../usr/include/
>> >  TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh 
>> > rtnetlink.sh
>> >  TEST_GEN_FILES =  socket
>> >  TEST_GEN_FILES += psock_fanout psock_tpacket
>> > -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
>> > -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict
>> > +TEST_GEN_PROGS += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
>> > +TEST_GEN_PROGS += reuseport_dualstack msg_zerocopy reuseaddr_conflict
>>
>> Hmm. I see msg_zerocopy.sh for running msg_zerocopy. msg_zerocopy should
>> still stay in TEST_GEN_FILES and msg_zerocopy.sh needs to be added to
>> TEST_PROGS so it runs.
>>
>
> Actually the shell script requires arguments, it doesn't just run the test.
> I'll fix this to just omit the test for now as it's not setup to run properly.
>
> Willem, could you follow up with a patch so that the zero copy test is run
> properly the way you envision it running?  You need to make sure that
>
> make -C tools/testing/selftests TARGETS=net run_tests
>
> actually runs your zero copy test the way you expect it to, otherwise it's 
> just
> sitting there collecting dust.  Thanks,

Will do.

In its current state, this test is really only meant to be run manually.
It demonstrates the API and outputs some information on stderr.

Zerocopy itself requires a two-host test. The feature is expressly
disabled over loopback.

But I can make this a pass/fail tests that exercises the interface
and notification channel and verifies that data was copied. It will
be a bit more work than just changing the default invocation of
msg_zerocopy.sh


Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c

2017-09-19 Thread Yuchung Cheng
On Tue, Sep 19, 2017 at 4:04 AM, Oleksandr Natalenko
 wrote:
> Hi.
>
> 18.09.2017 23:40, Yuchung Cheng wrote:
>>
>> I assume this kernel does not have the patch that Neal proposed in his
>> first reply?
>
>
> Correct.
>
>> The main warning needs to be triggered by another peculiar SACK that
>> kicks the sender into recovery again (after undo). Please let it run
>> longer if possible to see if we can get both. But the new data does
>> indicate the we can (validly) be in CA_Open with retrans_out > 0.
>
>
> OK, here it is:
>
> ===
> » LC_TIME=C jctl -kb | grep RIP
> …
> Sep 19 12:54:03 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> Sep 19 12:54:22 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> Sep 19 12:54:25 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> Sep 19 12:56:00 defiant kernel: RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990
> Sep 19 12:57:07 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> Sep 19 12:57:14 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> Sep 19 12:58:04 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0
> …
> ===
>
> Note timestamps — two types of warning are distant in time, so didn't happen
> at once.
>
> While still running this kernel, anything else I can check for you?
Thanks. Based on all the experiments you did I believe there's other
code path than my hypothesis that'd cause the warning:
1) Neal's proposed F-RTO fix didn't work
2) the main warning is not being triggered together with the newly-instrumented
warning in undo
3) Disabling RACK stopped the warning

We couldn't figure out exactly what. So we'll do a bit code auditing
first to find more suspects


Re: [PATCH v2 net-next] net: sk_buff rbnode reorg

2017-09-19 Thread Soheil Hassas Yeganeh
On Tue, Sep 19, 2017 at 8:14 AM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> skb->rbnode shares space with skb->next, skb->prev and skb->tstamp
>
> Current uses (TCP receive ofo queue and netem) need to save/restore
> tstamp, while skb->dev is either NULL (TCP) or a constant for a given
> queue (netem).
>
> Since we plan using an RB tree for TCP retransmit queue to speedup SACK
> processing with large BDP, this patch exchanges skb->dev and
> skb->tstamp.
>
> This saves some overhead in both TCP and netem.
>
> v2: removes the swtstamp field from struct tcp_skb_cb
>
> Signed-off-by: Eric Dumazet 
> Cc: Soheil Hassas Yeganeh 
> Cc: Wei Wang 
> Cc: Willem de Bruijn 

Acked-by: Soheil Hassas Yeganeh 

Very nice!


[patch net-next] team: fall back to hash if table entry is empty

2017-09-19 Thread Jim Hanko
If the hash to port mapping table does not have a valid port (i.e. when
a port goes down), fall back to the simple hashing mechanism to avoid
dropping packets.

Signed-off-by: Jim Hanko 
Acked-by: Jiri Pirko 
---
 drivers/net/team/team_mode_loadbalance.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/team/team_mode_loadbalance.c 
b/drivers/net/team/team_mode_loadbalance.c
index 1468ddf..a5ef970 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -137,7 +137,13 @@ static struct team_port *lb_htpm_select_tx_port(struct 
team *team,
struct sk_buff *skb,
unsigned char hash)
 {
-   return rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
+   struct team_port *port;
+
+   port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
+   if (likely(port))
+   return port;
+   /* If no valid port in the table, fall back to simple hash */
+   return lb_hash_select_tx_port(team, lb_priv, skb, hash);
 }
 
 struct lb_select_tx_port {
-- 
2.7.4



[PATCH] isdn/i4l: check the message proto does not change across fetches

2017-09-19 Thread Meng Xu
In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched
twice from userspace. The first fetch is used to peek at the protocol
of the message and reset the huptimer if necessary; while the second
fetch copies in the whole buffer. However, given that buf resides in
userspace memory, a user process can race to change its memory content
across fetches. By doing so, we can either avoid resetting the huptimer
for any type of packets (by first setting proto to PPP_LCP and later
change to the actual type) or force resetting the huptimer for LCP packets.

This patch does a memcmp between the two fetches and abort if changes to
the protobuf is detected across fetches.

Signed-off-by: Meng Xu 
---
 drivers/isdn/i4l/isdn_ppp.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index 6c44609..21a9ae8 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -857,6 +857,7 @@ isdn_ppp_write(int min, struct file *file, const char 
__user *buf, int count)
(lp->flags & ISDN_NET_CONNECTED)) {
unsigned short hl;
struct sk_buff *skb;
+   void *skb_tail;
/*
 * we need to reserve enough space in front of
 * sk_buff. old call to dev_alloc_skb only reserved
@@ -869,11 +870,21 @@ isdn_ppp_write(int min, struct file *file, const char 
__user *buf, int count)
return count;
}
skb_reserve(skb, hl);
-   if (copy_from_user(skb_put(skb, count), buf, count))
+   skb_tail = skb_put(skb, count);
+   if (copy_from_user(skb_tail, buf, count))
{
kfree_skb(skb);
return -EFAULT;
}
+
+   /*
+* abort if the message proto is changed between the 
fetches
+*/
+   if (memcmp(skb_tail, protobuf, 4)) {
+   kfree_skb(skb);
+   return -EFAULT;
+   }
+
if (is->debug & 0x40) {
printk(KERN_DEBUG "ppp xmit: len %d\n", (int) 
skb->len);
isdn_ppp_frame_log("xmit", skb->data, skb->len, 
32, is->unit, lp->ppp_slot);
-- 
2.7.4



Re: [5/5] e1000e: Avoid receiver overrun interrupt bursts

2017-09-19 Thread Philip Prindeville
Hi.

We’ve been running this patchset (all 5) for about as long as they’ve been 
under review… about 2 months.  And in a burn-in lab with heavy traffic.

We’ve not seen a single link-flap in hundreds of hours of saturated traffic.

Would love to see some resolution soon on this as we don’t want to ship a 
release with unsanctioned patches.

Is there an estimate on when that might be?

Thanks,

-Philip



> On Jul 21, 2017, at 12:36 PM, Benjamin Poirier  wrote:
> 
> When e1000e_poll() is not fast enough to keep up with incoming traffic, the
> adapter (when operating in msix mode) raises the Other interrupt to signal
> Receiver Overrun.
> 
> This is a double problem because 1) at the moment e1000_msix_other()
> assumes that it is only called in case of Link Status Change and 2) if the
> condition persists, the interrupt is repeatedly raised again in quick
> succession.
> 
> Ideally we would configure the Other interrupt to not be raised in case of
> receiver overrun but this doesn't seem possible on this adapter. Instead,
> we handle the first part of the problem by reverting to the practice of
> reading ICR in the other interrupt handler, like before commit 16ecba59bc33
> ("e1000e: Do not read ICR in Other interrupt"). Thanks to commit
> 0a8047ac68e5 ("e1000e: Fix msi-x interrupt automask") which cleared IAME
> from CTRL_EXT, reading ICR doesn't interfere with RxQ0, TxQ0 interrupts
> anymore. We handle the second part of the problem by not re-enabling the
> Other interrupt right away when there is overrun. Instead, we wait until
> traffic subsides, napi polling mode is exited and interrupts are
> re-enabled.
> 
> Reported-by: Lennart Sorensen 
> Fixes: 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt")
> Signed-off-by: Benjamin Poirier 
> Tested-by: Aaron Brown 
> ---
> drivers/net/ethernet/intel/e1000e/defines.h |  1 +
> drivers/net/ethernet/intel/e1000e/netdev.c  | 33 +++--
> 2 files changed, 27 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/e1000e/defines.h 
> b/drivers/net/ethernet/intel/e1000e/defines.h
> index 0641c0098738..afb7ebe20b24 100644
> --- a/drivers/net/ethernet/intel/e1000e/defines.h
> +++ b/drivers/net/ethernet/intel/e1000e/defines.h
> @@ -398,6 +398,7 @@
> #define E1000_ICR_LSC   0x0004 /* Link Status Change */
> #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */
> #define E1000_ICR_RXDMT00x0010 /* Rx desc min. threshold (0) */
> +#define E1000_ICR_RXO   0x0040 /* Receiver Overrun */
> #define E1000_ICR_RXT0  0x0080 /* Rx timer intr (ring 0) */
> #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */
> /* If this bit asserted, the driver should claim the interrupt */
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
> b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 5a8ab1136566..803edd1a6401 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -1910,12 +1910,30 @@ static irqreturn_t e1000_msix_other(int 
> __always_unused irq, void *data)
>   struct net_device *netdev = data;
>   struct e1000_adapter *adapter = netdev_priv(netdev);
>   struct e1000_hw *hw = &adapter->hw;
> + u32 icr;
> + bool enable = true;
> +
> + icr = er32(ICR);
> + if (icr & E1000_ICR_RXO) {
> + ew32(ICR, E1000_ICR_RXO);
> + enable = false;
> + /* napi poll will re-enable Other, make sure it runs */
> + /* napi poll will re-enable Other, make sure it runs */
> + if (napi_schedule_prep(&adapter->napi)) {
> + adapter->total_rx_bytes = 0;
> + adapter->total_rx_packets = 0;
> + __napi_schedule(&adapter->napi);
> + }
> + }
> + if (icr & E1000_ICR_LSC) {
> + ew32(ICR, E1000_ICR_LSC);
> + hw->mac.get_link_status = true;
> + /* guard against interrupt when we're going down */
> + if (!test_bit(__E1000_DOWN, &adapter->state)) {
> + mod_timer(&adapter->watchdog_timer, jiffies + 1);
> + }
> + }
> 
> - hw->mac.get_link_status = true;
> -
> - /* guard against interrupt when we're going down */
> - if (!test_bit(__E1000_DOWN, &adapter->state)) {
> - mod_timer(&adapter->watchdog_timer, jiffies + 1);
> + if (enable && !test_bit(__E1000_DOWN, &adapter->state)) {
>   ew32(IMS, E1000_IMS_OTHER);
>   }
> 
> @@ -2687,7 +2705,8 @@ static int e1000e_poll(struct napi_struct *napi, int 
> weight)
>   napi_complete_done(napi, work_done);
>   if (!test_bit(__E1000_DOWN, &adapter->state)) {
>   if (adapter->msix_entries)
> - ew32(IMS, adapter->rx_ring->ims_val);
> + ew32(IMS, adapter->rx_ring->ims_val |
> +  E1000_IMS_OTHER);
>   else
>

<    1   2   3