date:20170619

[PATCH net-next 06/12] nfp: add stats and xmit helpers for representors

2017-06-19 Thread Simon Horman

Provide helpers for stats and xmit on representor netdevs.

Parts based on work by Bert van Leeuwen, Benjamin LaHaise and
Jakub Kicinski.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 198 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  28 +++
 2 files changed, 225 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index bdd34d206d22..a97bb6f2cc12 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -38,9 +38,191 @@
 #include "nfpcore/nfp_cpp.h"
 #include "nfp_app.h"
 #include "nfp_main.h"
+#include "nfp_net_ctrl.h"
 #include "nfp_net_repr.h"
 #include "nfp_port.h"
 
+static void
+nfp_repr_inc_tx_stats(struct net_device *netdev, unsigned int len,
+ int tx_status)
+{
+   struct nfp_repr *repr = netdev_priv(netdev);
+   struct nfp_repr_pcpu_stats *stats;
+
+   if (unlikely(tx_status != NET_XMIT_SUCCESS &&
+tx_status != NET_XMIT_CN)) {
+   this_cpu_inc(repr->stats->tx_drops);
+   return;
+   }
+
+   stats = this_cpu_ptr(repr->stats);
+   u64_stats_update_begin(&stats->syncp);
+   stats->tx_packets++;
+   stats->tx_bytes += len;
+   u64_stats_update_end(&stats->syncp);
+}
+
+void nfp_repr_inc_rx_stats(struct net_device *netdev, unsigned int len)
+{
+   struct nfp_repr *repr = netdev_priv(netdev);
+   struct nfp_repr_pcpu_stats *stats;
+
+   stats = this_cpu_ptr(repr->stats);
+   u64_stats_update_begin(&stats->syncp);
+   stats->rx_packets++;
+   stats->rx_bytes += len;
+   u64_stats_update_end(&stats->syncp);
+}
+
+void
+nfp_repr_phy_port_get_stats64(const struct nfp_app *app, u8 phy_port,
+ struct rtnl_link_stats64 *stats)
+{
+   u8 __iomem *mem;
+
+   mem = app->pf->mac_stats_mem + phy_port * NFP_MAC_STATS_SIZE;
+
+   /* TX and RX stats are flipped as we are returning the stats as seen
+* at the switch port corresponding to the phys port.
+*/
+   stats->tx_packets = readq(mem + NFP_MAC_STATS_RX_FRAMES_RECEIVED_OK);
+   stats->tx_bytes = readq(mem + NFP_MAC_STATS_RX_IN_OCTETS);
+   stats->tx_dropped = readq(mem + NFP_MAC_STATS_RX_IN_ERRORS);
+
+   stats->rx_packets = readq(mem + NFP_MAC_STATS_TX_FRAMES_TRANSMITTED_OK);
+   stats->rx_bytes = readq(mem + NFP_MAC_STATS_TX_OUT_OCTETS);
+   stats->rx_dropped = readq(mem + NFP_MAC_STATS_TX_OUT_ERRORS);
+}
+
+void
+nfp_repr_vf_get_stats64(const struct nfp_app *app, u8 vf,
+   struct rtnl_link_stats64 *stats)
+{
+   u8 __iomem *mem;
+
+   mem = app->pf->vf_cfg_mem + vf * NFP_NET_CFG_BAR_SZ;
+
+   /* TX and RX stats are flipped as we are returning the stats as seen
+* at the switch port corresponding to the VF.
+*/
+   stats->tx_packets = readq(mem + NFP_NET_CFG_STATS_RX_FRAMES);
+   stats->tx_bytes = readq(mem + NFP_NET_CFG_STATS_RX_OCTETS);
+   stats->tx_dropped = readq(mem + NFP_NET_CFG_STATS_RX_DISCARDS);
+
+   stats->rx_packets = readq(mem + NFP_NET_CFG_STATS_TX_FRAMES);
+   stats->rx_bytes = readq(mem + NFP_NET_CFG_STATS_TX_OCTETS);
+   stats->rx_dropped = readq(mem + NFP_NET_CFG_STATS_TX_DISCARDS);
+}
+
+void
+nfp_repr_pf_get_stats64(const struct nfp_app *app, u8 pf,
+   struct rtnl_link_stats64 *stats)
+{
+   u8 __iomem *mem;
+
+   if (pf)
+   return;
+
+   mem = nfp_cpp_area_iomem(app->pf->data_vnic_bar);
+
+   stats->tx_packets = readq(mem + NFP_NET_CFG_STATS_RX_FRAMES);
+   stats->tx_bytes = readq(mem + NFP_NET_CFG_STATS_RX_OCTETS);
+   stats->tx_dropped = readq(mem + NFP_NET_CFG_STATS_RX_DISCARDS);
+
+   stats->rx_packets = readq(mem + NFP_NET_CFG_STATS_TX_FRAMES);
+   stats->rx_bytes = readq(mem + NFP_NET_CFG_STATS_TX_OCTETS);
+   stats->rx_dropped = readq(mem + NFP_NET_CFG_STATS_TX_DISCARDS);
+}
+
+void
+nfp_repr_get_stats64(const struct nfp_app *app, enum nfp_repr_type type,
+u8 port, struct rtnl_link_stats64 *stats)
+{
+   switch (type) {
+   case NFP_REPR_TYPE_PHYS_PORT:
+   nfp_repr_phy_port_get_stats64(app, port, stats);
+   break;
+   case NFP_REPR_TYPE_PF:
+   nfp_repr_pf_get_stats64(app, port, stats);
+   break;
+   case NFP_REPR_TYPE_VF:
+   nfp_repr_vf_get_stats64(app, port, stats);
+   default:
+   break;
+   }
+}
+
+bool
+nfp_repr_has_offload_stats(const struct net_device *dev, int attr_id)
+{
+   switch (attr_id) {
+   case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+   return true;
+   }
+
+   return false;
+}
+
+static int
+nfp_repr_get_host_stats64(const struct net_device *netdev,
+

[PATCH net-next 02/12] nfp: devlink add support for getting eswitch mode

2017-06-19 Thread Simon Horman

From: Jakub Kicinski 

Add an app callback for reporting the eswitch mode.  Non-SRIOV apps
should not implement this callback; nfp_app code will then
respond with -EOPNOTSUPP.

Signed-off-by: Jakub Kicinski 
Signed-off-by: Simon Horman 
---
 drivers/net/ethernet/netronome/nfp/nfp_app.h | 15 +++
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 18 ++
 2 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h 
b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index f5e373fa8c3b..0fee14ffa081 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -34,6 +34,8 @@
 #ifndef _NFP_APP_H
 #define _NFP_APP_H 1
 
+#include <net/devlink.h>
+
 struct bpf_prog;
 struct net_device;
 struct pci_dev;
@@ -70,6 +72,7 @@ extern const struct nfp_app_type app_bpf;
  * @setup_tc:  setup TC ndo
  * @tc_busy:   TC HW offload busy (rules loaded)
  * @xdp_offload:offload an XDP program
+ * @eswitch_mode_get:get SR-IOV eswitch mode
  */
 struct nfp_app_type {
enum nfp_app_id id;
@@ -95,6 +98,8 @@ struct nfp_app_type {
bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn);
int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn,
   struct bpf_prog *prog);
+
+   enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app);
 };
 
 /**
@@ -216,6 +221,16 @@ static inline void nfp_app_ctrl_rx(struct nfp_app *app, 
struct sk_buff *skb)
app->type->ctrl_msg_rx(app, skb);
 }
 
+static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode)
+{
+   if (!app->type->eswitch_mode_get)
+   return -EOPNOTSUPP;
+
+   *mode = app->type->eswitch_mode_get(app);
+
+   return 0;
+}
+
 const char *nfp_app_mip_name(struct nfp_app *app);
 struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c 
b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index 2609a0f28e81..6c9f29c2e975 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -149,9 +149,27 @@ nfp_devlink_port_unsplit(struct devlink *devlink, unsigned 
int port_index)
return ret;
 }
 
+static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
+{
+   struct nfp_pf *pf = devlink_priv(devlink);
+   int ret;
+
+   mutex_lock(&pf->lock);
+   if (!pf->app) {
+   ret = -EBUSY;
+   goto out;
+   }
+   ret = nfp_app_eswitch_mode_get(pf->app, mode);
+out:
+   mutex_unlock(&pf->lock);
+
+   return ret;
+}
+
 const struct devlink_ops nfp_devlink_ops = {
.port_split = nfp_devlink_port_split,
.port_unsplit   = nfp_devlink_port_unsplit,
+   .eswitch_mode_get   = nfp_devlink_eswitch_mode_get,
 };
 
 int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
-- 
2.1.4

[PATCH net-next 11/12] nfp: add flower app

2017-06-19 Thread Simon Horman

Add an app for flower offload. At this point the PF netdev and phys port
representor netdevs are initialised. Follow-up work will add support for
VF and PF representors and, beyond that, offloading the flower classifier.

Based in part on work by Benjamin LaHaise and Bert van Leeuwen.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/Makefile  |   1 +
 drivers/net/ethernet/netronome/nfp/flower/main.c | 294 +++
 drivers/net/ethernet/netronome/nfp/nfp_app.c |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_app.h |   4 +
 4 files changed, 300 insertions(+)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/main.c

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile 
b/drivers/net/ethernet/netronome/nfp/Makefile
index e14f62863add..10b556b2c59d 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -28,6 +28,7 @@ nfp-objs := \
bpf/main.o \
bpf/offload.o \
flower/cmsg.o \
+   flower/main.o \
nic/main.o
 
 ifeq ($(CONFIG_BPF_SYSCALL),y)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
b/drivers/net/ethernet/netronome/nfp/flower/main.c
new file mode 100644
index 000000000000..01864840a21b
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General Public License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../nfpcore/nfp_cpp.h"
+#include "../nfpcore/nfp_nsp.h"
+#include "../nfp_app.h"
+#include "../nfp_main.h"
+#include "../nfp_net.h"
+#include "../nfp_net_repr.h"
+#include "../nfp_port.h"
+#include "./cmsg.h"
+
+/**
+ * struct nfp_flower_priv - Flower APP per-vNIC priv data
+ * @nn: Pointer to vNIC
+ */
+struct nfp_flower_priv {
+   struct nfp_net *nn;
+};
+
+static const char *nfp_flower_extra_cap(struct nfp_app *app, struct nfp_net 
*nn)
+{
+   return "FLOWER";
+}
+
+static enum devlink_eswitch_mode eswitch_mode_get(struct nfp_app *app)
+{
+   return DEVLINK_ESWITCH_MODE_SWITCHDEV;
+}
+
+static enum nfp_repr_type
+nfp_flower_repr_get_type_and_port(struct nfp_app *app, u32 port_id, u8 *port)
+{
+   switch (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port_id)) {
+   case NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT:
+   *port = FIELD_GET(NFP_FLOWER_CMSG_PORT_PHYS_PORT_NUM,
+ port_id);
+   return NFP_REPR_TYPE_PHYS_PORT;
+
+   case NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT:
+   *port = FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC, port_id);
+   if (FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC_TYPE, port_id) ==
+   NFP_FLOWER_CMSG_PORT_VNIC_TYPE_PF)
+   return NFP_REPR_TYPE_PF;
+   else
+   return NFP_REPR_TYPE_VF;
+   }
+
+   return NFP_FLOWER_CMSG_PORT_TYPE_UNSPEC;
+}
+
+static struct net_device *
+nfp_flower_repr_get(struct nfp_app *app, u32 port_id)
+{
+   enum nfp_repr_type repr_type;
+   struct nfp_reprs *reprs;
+   u8 port = 0;
+
+   repr_type = nfp_flower_repr_get_type_and_port(app, port_id, &port);
+
+   reprs = rcu_dereference(app->reprs[repr_type]);
+   if (!reprs)
+   return NULL;
+
+   if (port >= reprs->num_reprs)
+   return NULL;
+
+   return reprs->reprs[port];
+}
+
+static void
+nfp_flower_repr_netdev_get_stats64(struct

[PATCH net-next 05/12] nfp: general representor implementation

2017-06-19 Thread Simon Horman

Provide infrastructure to create and destroy representors of a given type.

Parts based on work by Bert van Leeuwen, Benjamin LaHaise,
and Jakub Kicinski.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_app.c  |  20 +++
 drivers/net/ethernet/netronome/nfp/nfp_app.h  |  18 +++
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 156 ++
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  92 +
 5 files changed, 287 insertions(+)
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile 
b/drivers/net/ethernet/netronome/nfp/Makefile
index 5ad9a557f06a..a401113035f5 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -22,6 +22,7 @@ nfp-objs := \
nfp_net_common.o \
nfp_net_ethtool.o \
nfp_net_main.o \
+   nfp_net_repr.o \
nfp_netvf_main.o \
nfp_port.o \
bpf/main.o \
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c 
b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index 396b93f54823..c9ccb0f94604 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -38,6 +38,7 @@
 #include "nfpcore/nfp_nffw.h"
 #include "nfp_app.h"
 #include "nfp_main.h"
+#include "nfp_net_repr.h"
 
 static const struct nfp_app_type *apps[] = {
&app_nic,
@@ -68,6 +69,25 @@ struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, 
unsigned int size)
return skb;
 }
 
+struct nfp_reprs *
+nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type,
+ struct nfp_reprs *reprs)
+{
+   struct nfp_reprs *old;
+
+   old = rcu_dereference_protected(app->reprs[type],
+   lockdep_is_held(&app->pf->lock));
+   if (reprs && old) {
+   old = ERR_PTR(-EBUSY);
+   goto exit_unlock;
+   }
+
+   rcu_assign_pointer(app->reprs[type], reprs);
+
+exit_unlock:
+   return old;
+}
+
 struct nfp_app *nfp_app_alloc(struct nfp_pf *pf, enum nfp_app_id id)
 {
struct nfp_app *app;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h 
b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 0fee14ffa081..af023a0491e7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -36,6 +36,8 @@
 
 #include <net/devlink.h>
 
+#include "nfp_net_repr.h"
+
 struct bpf_prog;
 struct net_device;
 struct pci_dev;
@@ -73,6 +75,7 @@ extern const struct nfp_app_type app_bpf;
  * @tc_busy:   TC HW offload busy (rules loaded)
  * @xdp_offload:offload an XDP program
  * @eswitch_mode_get:get SR-IOV eswitch mode
+ * @repr_get:  get representor netdev
  */
 struct nfp_app_type {
enum nfp_app_id id;
@@ -100,6 +103,7 @@ struct nfp_app_type {
   struct bpf_prog *prog);
 
enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app);
+   struct net_device *(*repr_get)(struct nfp_app *app, u32 id);
 };
 
 /**
@@ -108,6 +112,7 @@ struct nfp_app_type {
  * @pf:backpointer to NFP PF structure
  * @cpp:   pointer to the CPP handle
  * @ctrl:  pointer to ctrl vNIC struct
+ * @reprs: array of pointers to representors
  * @type:  pointer to const application ops and info
  */
 struct nfp_app {
@@ -116,6 +121,7 @@ struct nfp_app {
struct nfp_cpp *cpp;
 
struct nfp_net *ctrl;
+   struct nfp_reprs __rcu *reprs[NFP_REPR_TYPE_MAX + 1];
 
const struct nfp_app_type *type;
 };
@@ -231,6 +237,18 @@ static inline int nfp_app_eswitch_mode_get(struct nfp_app 
*app, u16 *mode)
return 0;
 }
 
+static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id)
+{
+   if (unlikely(!app || !app->type->repr_get))
+   return NULL;
+
+   return app->type->repr_get(app, id);
+}
+
+struct nfp_reprs *
+nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type,
+ struct nfp_reprs *reprs);
+
 const char *nfp_app_mip_name(struct nfp_app *app);
 struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
new file mode 100644
index 000000000000..bdd34d206d22
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General Public License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the

[PATCH net-next 04/12] nfp: map mac_stats and vf_cfg BARs

2017-06-19 Thread Simon Horman

If present, map the mac_stats and vf_cfg BARs. These will be used by
representor netdevs to read statistics for phys port and VF representors.

Also provide defines describing the layout of the mac_stats area.
Similar defines are already present for the vf_cfg area.

Based in part on work by Jakub Kicinski.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_main.h  |   8 ++
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c  | 116 +++--
 drivers/net/ethernet/netronome/nfp/nfp_port.h  |  60 +++
 .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h   |   2 +
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c   |   5 +-
 5 files changed, 161 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h 
b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index 88724f8d0dcd..aa69d4101eb9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -68,6 +68,10 @@ struct nfp_rtsym_table;
  * @data_vnic_bar: Pointer to the CPP area for the data vNICs' BARs
  * @ctrl_vnic_bar: Pointer to the CPP area for the ctrl vNIC's BAR
  * @qc_area:   Pointer to the CPP area for the queues
+ * @mac_stats_bar: Pointer to the CPP area for the MAC stats
+ * @mac_stats_mem: Pointer to mapped MAC stats area
+ * @vf_cfg_bar:Pointer to the CPP area for the VF 
configuration BAR
+ * @vf_cfg_mem:Pointer to mapped VF configuration area
  * @irq_entries:   Array of MSI-X entries for all vNICs
  * @limit_vfs: Number of VFs supported by firmware (~0 for PCI limit)
  * @num_vfs:   Number of SR-IOV VFs enabled
@@ -97,6 +101,10 @@ struct nfp_pf {
struct nfp_cpp_area *data_vnic_bar;
struct nfp_cpp_area *ctrl_vnic_bar;
struct nfp_cpp_area *qc_area;
+   struct nfp_cpp_area *mac_stats_bar;
+   u8 __iomem *mac_stats_mem;
+   struct nfp_cpp_area *vf_cfg_bar;
+   u8 __iomem *vf_cfg_mem;
 
struct msix_entry *irq_entries;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index bc2bc0886176..eb87e1c08bb1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -570,6 +570,79 @@ static void nfp_net_pf_app_stop(struct nfp_pf *pf)
nfp_net_pf_app_stop_ctrl(pf);
 }
 
+static void nfp_net_pci_unmap_mem(struct nfp_pf *pf)
+{
+   if (pf->vf_cfg_bar)
+   nfp_cpp_area_release_free(pf->vf_cfg_bar);
+   if (pf->mac_stats_bar)
+   nfp_cpp_area_release_free(pf->mac_stats_bar);
+   nfp_cpp_area_release_free(pf->qc_area);
+   nfp_cpp_area_release_free(pf->data_vnic_bar);
+}
+
+static int nfp_net_pci_map_mem(struct nfp_pf *pf)
+{
+   u32 ctrl_bar_sz;
+   u8 __iomem *mem;
+   int err;
+
+   ctrl_bar_sz = pf->max_data_vnics * NFP_PF_CSR_SLICE_SIZE;
+   mem = nfp_net_pf_map_rtsym(pf, "net.ctrl", "_pf%d_net_bar0",
+  ctrl_bar_sz, &pf->data_vnic_bar);
+   if (IS_ERR(mem)) {
+   err = PTR_ERR(mem);
+   if (!pf->fw_loaded && err == -ENOENT)
+   err = -EPROBE_DEFER;
+   return err;
+   }
+
+   pf->mac_stats_mem = nfp_net_pf_map_rtsym(pf, "net.macstats",
+"_mac_stats",
+NFP_MAC_STATS_SIZE *
+(pf->eth_tbl->max_index + 1),
+&pf->mac_stats_bar);
+   if (IS_ERR(pf->mac_stats_mem)) {
+   if (PTR_ERR(pf->mac_stats_mem) != -ENOENT) {
+   err = PTR_ERR(pf->mac_stats_mem);
+   goto err_unmap_ctrl;
+   }
+   pf->mac_stats_mem = NULL;
+   }
+
+   pf->vf_cfg_mem = nfp_net_pf_map_rtsym(pf, "net.vfcfg",
+ "_pf%d_net_vf_bar",
+ NFP_NET_CFG_BAR_SZ *
+ pf->limit_vfs, &pf->vf_cfg_bar);
+   if (IS_ERR(pf->vf_cfg_mem)) {
+   if (PTR_ERR(pf->vf_cfg_mem) != -ENOENT) {
+   err = PTR_ERR(pf->vf_cfg_mem);
+   goto err_unmap_mac_stats;
+   }
+   pf->vf_cfg_mem = NULL;
+   }
+
+   mem = nfp_net_map_area(pf->cpp, "net.qc", 0, 0,
+  NFP_PCIE_QUEUE(0), NFP_QCP_QUEUE_AREA_SZ,
+  &pf->qc_area);
+   if (IS_ERR(mem)) {
+   nfp_err(pf->cpp, "Failed to map Queue Controller area.\n");
+   err = PTR_ERR(mem);
+   goto err_unmap_vf_cfg;
+   }
+
+   return 0;
+
+err_unmap_vf_cfg:
+   if (pf->vf_cfg_bar)
+

[PATCH net-next 12/12] nfp: add VF and PF representors to flower app

2017-06-19 Thread Simon Horman

Initialise VF and PF representors in flower app.

Based in part on work by Benjamin LaHaise, Bert van Leeuwen and
Jakub Kicinski.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/main.c | 86 +++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 01864840a21b..b30f1c4ffd3a 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -149,15 +149,81 @@ static const struct net_device_ops 
nfp_flower_repr_netdev_ops = {
.ndo_get_offload_stats  = nfp_repr_get_offload_stats,
 };
 
+static void nfp_flower_sriov_disable(struct nfp_app *app)
+{
+   nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_VF);
+}
+
+static int
+nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
+   enum nfp_flower_cmsg_port_vnic_type vnic_type,
+   enum nfp_repr_type repr_type, unsigned int cnt)
+{
+   u8 nfp_pcie = nfp_cppcore_pcie_unit(app->pf->cpp);
+   struct nfp_flower_priv *priv = app->priv;
+   struct nfp_reprs *reprs, *old_reprs;
+   const u8 queue = 0;
+   int i, err;
+
+   reprs = nfp_reprs_alloc(cnt);
+   if (!reprs)
+   return -ENOMEM;
+
+   for (i = 0; i < cnt; i++) {
+   u32 port_id;
+
+   reprs->reprs[i] = nfp_repr_alloc(app);
+   if (!reprs->reprs[i]) {
+   err = -ENOMEM;
+   goto err_reprs_clean;
+   }
+
+   SET_NETDEV_DEV(reprs->reprs[i], &priv->nn->pdev->dev);
+   eth_hw_addr_inherit(reprs->reprs[i], priv->nn->dp.netdev);
+
+   port_id = nfp_flower_cmsg_pcie_port(nfp_pcie, vnic_type,
+   i, queue);
+   err = nfp_repr_init(app, reprs->reprs[i],
+   &nfp_flower_repr_netdev_ops,
+   port_id, NULL, priv->nn->dp.netdev);
+   if (err)
+   goto err_reprs_clean;
+
+   nfp_info(app->cpp, "%s%d Representor(%s) created\n",
+repr_type == NFP_REPR_TYPE_PF ? "PF" : "VF", i,
+reprs->reprs[i]->name);
+   }
+
+   old_reprs = nfp_app_reprs_set(app, repr_type, reprs);
+   if (IS_ERR(old_reprs)) {
+   err = PTR_ERR(old_reprs);
+   goto err_reprs_clean;
+   }
+
+   return 0;
+err_reprs_clean:
+   nfp_reprs_clean_and_free(reprs);
+   return err;
+}
+
+static int nfp_flower_sriov_enable(struct nfp_app *app, int num_vfs)
+{
+   return nfp_flower_spawn_vnic_reprs(app,
+  NFP_FLOWER_CMSG_PORT_VNIC_TYPE_VF,
+  NFP_REPR_TYPE_VF, num_vfs);
+}
+
 static void nfp_flower_stop(struct nfp_app *app)
 {
+   nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PF);
nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PHYS_PORT);
+
 }
 
-static int nfp_flower_start(struct nfp_app *app)
+static int
+nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 {
struct nfp_eth_table *eth_tbl = app->pf->eth_tbl;
-   struct nfp_flower_priv *priv = app->priv;
struct nfp_reprs *reprs, *old_reprs;
unsigned int i;
int err;
@@ -218,6 +284,19 @@ static int nfp_flower_start(struct nfp_app *app)
return err;
 }
 
+static int nfp_flower_start(struct nfp_app *app)
+{
+   int err;
+
+   err = nfp_flower_spawn_phy_reprs(app, app->priv);
+   if (err)
+   return err;
+
+   return nfp_flower_spawn_vnic_reprs(app,
+  NFP_FLOWER_CMSG_PORT_VNIC_TYPE_PF,
+  NFP_REPR_TYPE_PF, 1);
+}
+
 static void nfp_flower_vnic_clean(struct nfp_app *app, struct nfp_net *nn)
 {
kfree(app->priv);
@@ -289,6 +368,9 @@ const struct nfp_app_type app_flower = {
 
.ctrl_msg_rx= nfp_flower_cmsg_rx,
 
+   .sriov_enable   = nfp_flower_sriov_enable,
+   .sriov_disable  = nfp_flower_sriov_disable,
+
.eswitch_mode_get  = eswitch_mode_get,
.repr_get   = nfp_flower_repr_get,
 };
-- 
2.1.4

[PATCH net-next 07/12] nfp: app callbacks for SRIOV

2017-06-19 Thread Simon Horman

Add app-callbacks for app-specific initialisation of SRIOV.

Disabling SRIOV is brought forward in nfp_pci_remove()
so that nfp_app_sriov_disable is called while the app still exists.

This is intended to be used to implement representor netdevs for virtual
ports.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_app.h  | 18 
 drivers/net/ethernet/netronome/nfp/nfp_main.c | 42 +++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h 
b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index af023a0491e7..ff2d43615808 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -75,6 +75,8 @@ extern const struct nfp_app_type app_bpf;
  * @tc_busy:   TC HW offload busy (rules loaded)
  * @xdp_offload:offload an XDP program
  * @eswitch_mode_get:get SR-IOV eswitch mode
+ * @sriov_enable: app-specific sriov initialisation
+ * @sriov_disable: app-specific sriov clean-up
  * @repr_get:  get representor netdev
  */
 struct nfp_app_type {
@@ -102,6 +104,9 @@ struct nfp_app_type {
int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn,
   struct bpf_prog *prog);
 
+   int (*sriov_enable)(struct nfp_app *app, int num_vfs);
+   void (*sriov_disable)(struct nfp_app *app);
+
enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app);
struct net_device *(*repr_get)(struct nfp_app *app, u32 id);
 };
@@ -237,6 +242,19 @@ static inline int nfp_app_eswitch_mode_get(struct nfp_app 
*app, u16 *mode)
return 0;
 }
 
+static inline int nfp_app_sriov_enable(struct nfp_app *app, int num_vfs)
+{
+   if (!app || !app->type->sriov_enable)
+   return -EOPNOTSUPP;
+   return app->type->sriov_enable(app, num_vfs);
+}
+
+static inline void nfp_app_sriov_disable(struct nfp_app *app)
+{
+   if (app && app->type->sriov_disable)
+   app->type->sriov_disable(app);
+}
+
 static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id)
 {
if (unlikely(!app || !app->type->repr_get))
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c 
b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 4e59dcb78c36..748e54cc885e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -54,6 +54,7 @@
 
 #include "nfpcore/nfp6000_pcie.h"
 
+#include "nfp_app.h"
 #include "nfp_main.h"
 #include "nfp_net.h"
 
@@ -97,28 +98,45 @@ static int nfp_pcie_sriov_enable(struct pci_dev *pdev, int 
num_vfs)
struct nfp_pf *pf = pci_get_drvdata(pdev);
int err;
 
+   mutex_lock(&pf->lock);
+
if (num_vfs > pf->limit_vfs) {
nfp_info(pf->cpp, "Firmware limits number of VFs to %u\n",
 pf->limit_vfs);
-   return -EINVAL;
+   err = -EINVAL;
+   goto err_unlock;
+   }
+
+   err = nfp_app_sriov_enable(pf->app, num_vfs);
+   if (err) {
+   dev_warn(&pdev->dev, "App specific PCI sriov configuration 
failed: %d\n",
+err);
+   goto err_unlock;
}
 
err = pci_enable_sriov(pdev, num_vfs);
if (err) {
dev_warn(&pdev->dev, "Failed to enable PCI sriov: %d\n", err);
-   return err;
+   goto err_app_sriov_disable;
}
 
pf->num_vfs = num_vfs;
 
dev_dbg(&pdev->dev, "Created %d VFs.\n", pf->num_vfs);
 
+   mutex_unlock(&pf->lock);
return num_vfs;
+
+err_app_sriov_disable:
+   nfp_app_sriov_disable(pf->app);
+err_unlock:
+   mutex_unlock(&pf->lock);
+   return err;
 #endif
return 0;
 }
 
-static int nfp_pcie_sriov_disable(struct pci_dev *pdev)
+static int __nfp_pcie_sriov_disable(struct pci_dev *pdev)
 {
 #ifdef CONFIG_PCI_IOV
struct nfp_pf *pf = pci_get_drvdata(pdev);
@@ -132,6 +150,8 @@ static int nfp_pcie_sriov_disable(struct pci_dev *pdev)
return -EPERM;
}
 
+   nfp_app_sriov_disable(pf->app);
+
pf->num_vfs = 0;
 
pci_disable_sriov(pdev);
@@ -140,6 +160,18 @@ static int nfp_pcie_sriov_disable(struct pci_dev *pdev)
return 0;
 }
 
+static int nfp_pcie_sriov_disable(struct pci_dev *pdev)
+{
+   struct nfp_pf *pf = pci_get_drvdata(pdev);
+   int err;
+
+   mutex_lock(&pf->lock);
+   err = __nfp_pcie_sriov_disable(pdev);
+   mutex_unlock(&pf->lock);
+
+   return err;
+}
+
 static int nfp_pcie_sriov_configure(struct pci_dev *pdev, int num_vfs)
 {
if (num_vfs == 0)
@@ -431,11 +463,11 @@ static void nfp_pci_remove(struct pci_dev *pdev)
 
devlink = priv_to_devlink(pf);
 
-   nfp_net_pci_remove(pf);
-
nfp_pcie_sriov_disable(pdev);
pci_sriov_set_totalvfs(pf->pdev, 0);
 
+   nfp_net_pci_remove(pf);
+

[PATCH net-next 09/12] nfp: add support for tx/rx with metadata portid

2017-06-19 Thread Simon Horman

Allow tx/rx with metadata port id. This will be used for tx/rx of
representor netdevs acting as upper-devices while a pf netdev acts
as a lower-device.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 57 +++---
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 02fd8d4e253c..96c8ea476c05 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -318,6 +318,7 @@ struct nfp_meta_parsed {
u8 csum_type;
u32 hash;
u32 mark;
+   u32 portid;
__wsum csum;
 };
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 2b1ae666..046e4d929e93 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -755,6 +755,26 @@ static void nfp_net_tx_xmit_more_flush(struct 
nfp_net_tx_ring *tx_ring)
tx_ring->wr_ptr_add = 0;
 }
 
+static int nfp_net_prep_port_id(struct sk_buff *skb)
+{
+   struct metadata_dst *md_dst = skb_metadata_dst(skb);
+   unsigned char *data;
+
+   if (likely(!md_dst))
+   return 0;
+   if (unlikely(md_dst->type != METADATA_HW_PORT_MUX))
+   return 0;
+
+   if (unlikely(skb_cow_head(skb, 8)))
+   return -ENOMEM;
+
+   data = skb_push(skb, 8);
+   put_unaligned_be32(NFP_NET_META_PORTID, data);
+   memcpy(data + 4, &md_dst->u.port_info.port_id, 4);
+
+   return 8;
+}
+
 /**
  * nfp_net_tx() - Main transmit entry point
  * @skb:SKB to transmit
@@ -767,6 +787,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
struct nfp_net *nn = netdev_priv(netdev);
const struct skb_frag_struct *frag;
struct nfp_net_tx_desc *txd, txdg;
+   int f, nr_frags, wr_idx, md_bytes;
struct nfp_net_tx_ring *tx_ring;
struct nfp_net_r_vector *r_vec;
struct nfp_net_tx_buf *txbuf;
@@ -774,8 +795,6 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
struct nfp_net_dp *dp;
dma_addr_t dma_addr;
unsigned int fsize;
-   int f, nr_frags;
-   int wr_idx;
u16 qidx;
 
dp = &nn->dp;
@@ -797,6 +816,13 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
return NETDEV_TX_BUSY;
}
 
+   md_bytes = nfp_net_prep_port_id(skb);
+   if (unlikely(md_bytes < 0)) {
+   nfp_net_tx_xmit_more_flush(tx_ring);
+   dev_kfree_skb_any(skb);
+   return NETDEV_TX_OK;
+   }
+
/* Start with the head skbuf */
dma_addr = dma_map_single(dp->dev, skb->data, skb_headlen(skb),
  DMA_TO_DEVICE);
@@ -815,7 +841,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
 
/* Build TX descriptor */
txd = &tx_ring->txds[wr_idx];
-   txd->offset_eop = (nr_frags == 0) ? PCIE_DESC_TX_EOP : 0;
+   txd->offset_eop = (nr_frags ? 0 : PCIE_DESC_TX_EOP) | md_bytes;
txd->dma_len = cpu_to_le16(skb_headlen(skb));
nfp_desc_set_dma_addr(txd, dma_addr);
txd->data_len = cpu_to_le16(skb->len);
@@ -855,7 +881,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
*txd = txdg;
txd->dma_len = cpu_to_le16(fsize);
nfp_desc_set_dma_addr(txd, dma_addr);
-   txd->offset_eop =
+   txd->offset_eop |=
(f == nr_frags - 1) ? PCIE_DESC_TX_EOP : 0;
}
 
@@ -1450,6 +1476,10 @@ nfp_net_parse_meta(struct net_device *netdev, struct 
nfp_meta_parsed *meta,
meta->mark = get_unaligned_be32(data);
data += 4;
break;
+   case NFP_NET_META_PORTID:
+   meta->portid = get_unaligned_be32(data);
+   data += 4;
+   break;
case NFP_NET_META_CSUM:
meta->csum_type = CHECKSUM_COMPLETE;
meta->csum =
@@ -1594,6 +1624,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
struct nfp_net_rx_buf *rxbuf;
struct nfp_net_rx_desc *rxd;
struct nfp_meta_parsed meta;
+   struct net_device *netdev;
dma_addr_t new_dma_addr;
void *new_frag;
 
@@ -1672,7 +1703,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
}
 
if (xdp_prog && !(rxd->rxd.flags &

[PATCH net-next 08/12] nfp: provide nfp_port to of nfp_net_get_mac_addr()

2017-06-19 Thread Simon Horman

Provide port rather than vNIC as parameter of nfp_net_get_mac_addr.
This is to allow this function to be used by representor netdevs where
a vNIC may have more than one physical port none of which are associated
with the vNIC.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_app_nic.c  |  2 +-
 drivers/net/ethernet/netronome/nfp/nfp_main.h |  3 ++-
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 25 +++
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c 
b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
index 7b966bd3d214..c11a6c34e217 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
@@ -69,7 +69,7 @@ int nfp_app_nic_vnic_init(struct nfp_app *app, struct nfp_net 
*nn,
if (err)
return err < 0 ? err : 0;
 
-   nfp_net_get_mac_addr(app->pf, nn, id);
+   nfp_net_get_mac_addr(app->pf, nn->port, id);
 
return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h 
b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index aa69d4101eb9..edc14dc78674 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -58,6 +58,7 @@ struct nfp_hwinfo;
 struct nfp_mip;
 struct nfp_net;
 struct nfp_nsp_identify;
+struct nfp_port;
 struct nfp_rtsym_table;
 
 /**
@@ -147,7 +148,7 @@ void nfp_hwmon_unregister(struct nfp_pf *pf);
 struct nfp_eth_table_port *
 nfp_net_find_port(struct nfp_eth_table *eth_tbl, unsigned int id);
 void
-nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id);
+nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_port *port, unsigned int 
id);
 
 bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index eb87e1c08bb1..e16a5fa92279 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -135,25 +135,24 @@ static u8 __iomem *nfp_net_map_area(struct nfp_cpp *cpp,
 /**
  * nfp_net_get_mac_addr() - Get the MAC address.
  * @pf:   NFP PF handle
- * @nn:   NFP Network structure
+ * @port: NFP port structure
  * @id:  NFP port id
  *
  * First try to get the MAC address from NSP ETH table. If that
  * fails try HWInfo.  As a last resort generate a random address.
  */
 void
-nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id)
+nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_port *port, unsigned int id)
 {
struct nfp_eth_table_port *eth_port;
-   struct nfp_net_dp *dp = &nn->dp;
u8 mac_addr[ETH_ALEN];
const char *mac_str;
char name[32];
 
-   eth_port = __nfp_port_get_eth_port(nn->port);
+   eth_port = __nfp_port_get_eth_port(port);
if (eth_port) {
-   ether_addr_copy(dp->netdev->dev_addr, eth_port->mac_addr);
-   ether_addr_copy(dp->netdev->perm_addr, eth_port->mac_addr);
+   ether_addr_copy(port->netdev->dev_addr, eth_port->mac_addr);
+   ether_addr_copy(port->netdev->perm_addr, eth_port->mac_addr);
return;
}
 
@@ -161,22 +160,22 @@ nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net 
*nn, unsigned int id)
 
mac_str = nfp_hwinfo_lookup(pf->hwinfo, name);
if (!mac_str) {
-   dev_warn(dp->dev, "Can't lookup MAC address. Generate\n");
-   eth_hw_addr_random(dp->netdev);
+   nfp_warn(pf->cpp, "Can't lookup MAC address. Generate\n");
+   eth_hw_addr_random(port->netdev);
return;
}
 
if (sscanf(mac_str, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
   &mac_addr[0], &mac_addr[1], &mac_addr[2],
   &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6) {
-   dev_warn(dp->dev,
-"Can't parse MAC address (%s). Generate.\n", mac_str);
-   eth_hw_addr_random(dp->netdev);
+   nfp_warn(pf->cpp, "Can't parse MAC address (%s). Generate.\n",
+mac_str);
+   eth_hw_addr_random(port->netdev);
return;
}
 
-   ether_addr_copy(dp->netdev->dev_addr, mac_addr);
-   ether_addr_copy(dp->netdev->perm_addr, mac_addr);
+   ether_addr_copy(port->netdev->dev_addr, mac_addr);
+   ether_addr_copy(port->netdev->perm_addr, mac_addr);
 }
 
 struct nfp_eth_table_port *
-- 
2.1.4

[PATCH net-next 10/12] nfp: add support for control messages for flower app

2017-06-19 Thread Simon Horman

In preparation for adding a new flower app - targeted at offloading
the flower classifier - provide support for control messages that it will
use to communicate with the NFP.

Based in part on work by Bert van Leeuwen.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/Makefile  |   1 +
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 159 +++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h | 116 +
 drivers/net/ethernet/netronome/nfp/nfp_app.c |   5 +-
 drivers/net/ethernet/netronome/nfp/nfp_app.h |   3 +-
 5 files changed, 281 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.h

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile 
b/drivers/net/ethernet/netronome/nfp/Makefile
index a401113035f5..e14f62863add 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -27,6 +27,7 @@ nfp-objs := \
nfp_port.o \
bpf/main.o \
bpf/offload.o \
+   flower/cmsg.o \
nic/main.o
 
 ifeq ($(CONFIG_BPF_SYSCALL),y)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
new file mode 100644
index ..523ae03b49c5
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2015-2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "../nfpcore/nfp_cpp.h"
+#include "../nfp_net_repr.h"
+#include "./cmsg.h"
+
+#define nfp_flower_cmsg_warn(app, fmt, args...)
\
+   do {\
+   if (net_ratelimit())\
+   nfp_warn((app)->cpp, fmt, ## args); \
+   } while (0)
+
+static struct nfp_flower_cmsg_hdr *
+nfp_flower_cmsg_get_hdr(struct sk_buff *skb)
+{
+   return (struct nfp_flower_cmsg_hdr *)skb->data;
+}
+
+static void *nfp_flower_cmsg_get_data(struct sk_buff *skb)
+{
+   return (unsigned char *)skb->data + NFP_FLOWER_CMSG_HLEN;
+}
+
+static struct sk_buff *
+nfp_flower_cmsg_alloc(struct nfp_app *app, unsigned int size,
+ enum nfp_flower_cmsg_type_port type)
+{
+   struct nfp_flower_cmsg_hdr *ch;
+   struct sk_buff *skb;
+
+   size += NFP_FLOWER_CMSG_HLEN;
+
+   skb = nfp_app_ctrl_msg_alloc(app, size, GFP_KERNEL);
+   if (!skb)
+   return NULL;
+
+   ch = nfp_flower_cmsg_get_hdr(skb);
+   ch->pad = 0;
+   ch->version = NFP_FLOWER_CMSG_VER1;
+   ch->type = type;
+   skb_put(skb, size);
+
+   return skb;
+}
+
+int nfp_flower_cmsg_portmod(struct net_device *netdev)
+{
+   struct nfp_repr *repr = netdev_priv(netdev);
+   struct nfp_flower_cmsg_portmod *msg;
+   struct sk_buff *skb;
+
+   skb = nfp_flower_cmsg_alloc(repr->app, sizeof(*msg),
+   NFP_FLOWER_CMSG_TYPE_PORT_MOD);
+   if (!skb)
+   return -ENOMEM;
+
+   msg = nfp_flower_cmsg_get_data(skb);
+   msg->portnum = repr->dst->u.port_info.port_id;
+   msg->reserved = 0;
+   msg->info = netif_carrier_ok(netdev);
+   msg->mtu = cpu_to_be16(netdev->mtu);
+
+   nfp_ctrl_tx(repr->app->ctrl, skb);
+
+   return

[PATCH net-next 03/12] nfp: move physical port init into a helper

2017-06-19 Thread Simon Horman

From: Jakub Kicinski 

Move MAC/PHY port init into a helper to make it easier to reuse
it in the representor code.

Signed-off-by: Jakub Kicinski 
Signed-off-by: Simon Horman 
---
 drivers/net/ethernet/netronome/nfp/nfp_app_nic.c | 23 ++
 drivers/net/ethernet/netronome/nfp/nfp_port.c| 25 
 drivers/net/ethernet/netronome/nfp/nfp_port.h|  3 +++
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c 
b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
index 83c65e6291ee..7b966bd3d214 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
@@ -42,6 +42,8 @@ static int
 nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
   struct nfp_net *nn, unsigned int id)
 {
+   int err;
+
if (!pf->eth_tbl)
return 0;
 
@@ -49,26 +51,13 @@ nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct 
nfp_app *app,
if (IS_ERR(nn->port))
return PTR_ERR(nn->port);
 
-   nn->port->eth_id = id;
-   nn->port->eth_port = nfp_net_find_port(pf->eth_tbl, id);
-
-   /* Check if vNIC has external port associated and cfg is OK */
-   if (!nn->port->eth_port) {
-   nfp_err(app->cpp,
-   "NSP port entries don't match vNICs (no entry for port 
#%d)\n",
-   id);
+   err = nfp_port_init_phy_port(pf, app, nn->port, id);
+   if (err) {
nfp_port_free(nn->port);
-   return -EINVAL;
-   }
-   if (nn->port->eth_port->override_changed) {
-   nfp_warn(app->cpp,
-"Config changed for port #%d, reboot required before 
port will be operational\n",
-id);
-   nn->port->type = NFP_PORT_INVALID;
-   return 1;
+   return err;
}
 
-   return 0;
+   return nn->port->type == NFP_PORT_INVALID;
 }
 
 int nfp_app_nic_vnic_init(struct nfp_app *app, struct nfp_net *nn,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c 
b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index a17410ac01ab..19bceeb82225 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -33,6 +33,7 @@
 
 #include 
 
+#include "nfpcore/nfp_cpp.h"
 #include "nfpcore/nfp_nsp.h"
 #include "nfp_app.h"
 #include "nfp_main.h"
@@ -112,6 +113,30 @@ nfp_port_get_phys_port_name(struct net_device *netdev, 
char *name, size_t len)
return 0;
 }
 
+int nfp_port_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
+  struct nfp_port *port, unsigned int id)
+{
+   port->eth_id = id;
+   port->eth_port = nfp_net_find_port(pf->eth_tbl, id);
+
+   /* Check if vNIC has external port associated and cfg is OK */
+   if (!port->eth_port) {
+   nfp_err(app->cpp,
+   "NSP port entries don't match vNICs (no entry for port 
#%d)\n",
+   id);
+   return -EINVAL;
+   }
+   if (port->eth_port->override_changed) {
+   nfp_warn(app->cpp,
+"Config changed for port #%d, reboot required before 
port will be operational\n",
+id);
+   port->type = NFP_PORT_INVALID;
+   return 0;
+   }
+
+   return 0;
+}
+
 struct nfp_port *
 nfp_port_alloc(struct nfp_app *app, enum nfp_port_type type,
   struct net_device *netdev)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h 
b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 4d1a9b3fed41..fb28c7071987 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -104,6 +104,9 @@ nfp_port_alloc(struct nfp_app *app, enum nfp_port_type type,
   struct net_device *netdev);
 void nfp_port_free(struct nfp_port *port);
 
+int nfp_port_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
+  struct nfp_port *port, unsigned int id);
+
 int nfp_net_refresh_eth_port(struct nfp_port *port);
 void nfp_net_refresh_port_table(struct nfp_port *port);
 int nfp_net_refresh_port_table_sync(struct nfp_pf *pf);
-- 
2.1.4

[PATCH net-next 01/12] net: store port/representator id in metadata_dst

2017-06-19 Thread Simon Horman

From: Jakub Kicinski 

Switches and modern SR-IOV enabled NICs may multiplex traffic from Port
representators and control messages over single set of hardware queues.
Control messages and muxed traffic may need ordered delivery.

Those requirements make it hard to comfortably use TC infrastructure today
unless we have a way of attaching metadata to skbs at the upper device.
Because single set of queues is used for many netdevs stopping TC/sched queues
of all of them reliably is impossible and lower device has to retreat to
returning NETDEV_TX_BUSY and usually has to take extra locks on the fastpath.

This patch attempts to enable port/representative devs to attach metadata to
skbs which carry port id.  This way representatives can be queueless and all
queuing can be performed at the lower netdev in the usual way.

Traffic arriving on the port/representative interfaces will have metadata
attached and will subsequently be queued to the lower device for transmission.
The lower device should recognize the metadata and translate it to HW specific
format which is most likely either a special header inserted before the network
headers or descriptor/metadata fields.

Metadata is associated with the lower device by storing the netdev pointer
along with port id so that if TC decides to redirect or mirror the new netdev
will not try to interpret it.

This is mostly for SR-IOV devices since switches don't have lower netdevs
today.

Signed-off-by: Jakub Kicinski 
Signed-off-by: Sridhar Samudrala 
Signed-off-by: Simon Horman 
---
 include/net/dst_metadata.h | 41 -
 net/core/dst.c | 15 ++-
 net/core/filter.c  |  1 +
 net/ipv4/ip_tunnel_core.c  |  6 --
 net/openvswitch/flow_netlink.c |  4 +++-
 5 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 701fc814d0af..a803129a4849 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -5,10 +5,22 @@
 #include 
 #include 
 
+enum metadata_type {
+   METADATA_IP_TUNNEL,
+   METADATA_HW_PORT_MUX,
+};
+
+struct hw_port_info {
+   struct net_device *lower_dev;
+   u32 port_id;
+};
+
 struct metadata_dst {
struct dst_entrydst;
+   enum metadata_type  type;
union {
struct ip_tunnel_info   tun_info;
+   struct hw_port_info port_info;
} u;
 };
 
@@ -27,7 +39,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info(struct 
sk_buff *skb)
struct metadata_dst *md_dst = skb_metadata_dst(skb);
struct dst_entry *dst;
 
-   if (md_dst)
+   if (md_dst && md_dst->type == METADATA_IP_TUNNEL)
return &md_dst->u.tun_info;
 
dst = skb_dst(skb);
@@ -55,22 +67,33 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff 
*skb_a,
a = (const struct metadata_dst *) skb_dst(skb_a);
b = (const struct metadata_dst *) skb_dst(skb_b);
 
-   if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
+   if (!a != !b || a->type != b->type)
return 1;
 
-   return memcmp(&a->u.tun_info, &b->u.tun_info,
- sizeof(a->u.tun_info) + a->u.tun_info.options_len);
+   switch (a->type) {
+   case METADATA_HW_PORT_MUX:
+   return memcmp(&a->u.port_info, &b->u.port_info,
+ sizeof(a->u.port_info));
+   case METADATA_IP_TUNNEL:
+   return memcmp(&a->u.tun_info, &b->u.tun_info,
+ sizeof(a->u.tun_info) +
+a->u.tun_info.options_len);
+   default:
+   return 1;
+   }
 }
 
 void metadata_dst_free(struct metadata_dst *);
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t 
flags);
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+   gfp_t flags);
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
 
 static inline struct metadata_dst *tun_rx_dst(int md_size)
 {
struct metadata_dst *tun_dst;
 
-   tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
+   tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
if (!tun_dst)
return NULL;
 
@@ -85,11 +108,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct 
sk_buff *skb)
int md_size;
struct metadata_dst *new_md;
 
-   if (!md_dst)
+   if (!md_dst || md_dst->type != METADATA_IP_TUNNEL)
return ERR_PTR(-EINVAL);
 
md_size = md_dst->u.tun_info.options_len;
-   new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
+

[PATCH net-next 00/12] nfp: add flower app with representors

2017-06-19 Thread Simon Horman

Hi,

this series adds a flower app to the NFP driver.
It initialises four types of netdevs:

* PF netdev - lower-device for communication of packets to device
* PF representor netdev
* VF representor netdevs
* Phys port representor netdevs

The PF netdev acts as a lower-device which sends and receives packets to
and from the firmware. The representors act as upper-devices. For TX
representors attach a metadata dst to the skb which is used by the PF
netdev to prepend metadata to the packet before forwarding it to the firmware. On
RX the PF netdev looks up the representor based on the prepended metadata
received from the firmware and forwards the skb to the representor after
removing the metadata.

Control queues are used to send and receive control messages which are
used to communicate configuration information with the firmware. These
are in separate vNIC to the queues belonging to the PF netdev. The control
queues are not exposed to user-space via a netdev or any other means.

As the name implies this app is targeted at providing offload of TC flower.
That will be added by follow-up work. This patchset focuses on adding phys
port and VF representor netdevs to which flower classifiers may be attached.

Jakub Kicinski (3):
  net: store port/representator id in metadata_dst
  nfp: devlink add support for getting eswitch mode
  nfp: move physical port init into a helper

Simon Horman (9):
  nfp: map mac_stats and vf_cfg BARs
  nfp: general representor implementation
  nfp: add stats and xmit helpers for representors
  nfp: app callbacks for SRIOV
  nfp: provide nfp_port to of nfp_net_get_mac_addr()
  nfp: add support for tx/rx with metadata portid
  nfp: add support for control messages for flower app
  nfp: add flower app
  nfp: add VF and PF representors to flower app

 drivers/net/ethernet/netronome/nfp/Makefile|   3 +
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c   | 159 +
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 116 +++
 drivers/net/ethernet/netronome/nfp/flower/main.c   | 376 +
 drivers/net/ethernet/netronome/nfp/nfp_app.c   |  26 +-
 drivers/net/ethernet/netronome/nfp/nfp_app.h   |  58 +++-
 drivers/net/ethernet/netronome/nfp/nfp_app_nic.c   |  25 +-
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c   |  18 +
 drivers/net/ethernet/netronome/nfp/nfp_main.c  |  42 ++-
 drivers/net/ethernet/netronome/nfp/nfp_main.h  |  11 +-
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c|  57 +++-
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c  | 141 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c  | 352 +++
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h  | 120 +++
 drivers/net/ethernet/netronome/nfp/nfp_port.c  |  25 ++
 drivers/net/ethernet/netronome/nfp/nfp_port.h  |  63 
 .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h   |   2 +
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c   |   5 +-
 include/net/dst_metadata.h |  41 ++-
 net/core/dst.c |  15 +-
 net/core/filter.c  |   1 +
 net/ipv4/ip_tunnel_core.c  |   6 +-
 net/openvswitch/flow_netlink.c |   4 +-
 24 files changed, 1574 insertions(+), 93 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.h
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/main.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h

-- 
2.1.4

Re: [PATCH net-next 0/1] Introduction of the tc tests

2017-06-19 Thread Stephen Hemminger

On Mon, 19 Jun 2017 23:48:19 -0400 (EDT)
David Miller  wrote:

> From: Cong Wang 
> Date: Mon, 19 Jun 2017 16:37:29 -0700
> 
> > Hi,
> > 
> > On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates  wrote:  
> >> Apologies for sending this as one big patch. I've been sitting on this a 
> >> little
> >> too long, but it's ready and I wanted to get it out.
> >>
> >> There are a limited number of tests to start - I plan to add more on a 
> >> regular
> >> basis.
> >>
> >> Lucas Bates (1):
> >>   selftests: Introduce tc testsuite  
> > 
> > Nice work!
> > 
> > Is there any particular reason you want to put these tests in kernel tree
> > especially tools/testing/selftests/ ?  
> 
> Yeah, it would be absolutely terrible if we had more tests in the
> kernel selftests area for networking.
> 
> More seriously, we need more, not less, tests in the kernel networking
> selftests directory.
> 
> It doesn't belong in iproute2 because we want a place to put things
> that automatically get tested when someone makes kernel changes and
> can be integrated into the kernel development workflow.
> 
> I want as many tests as possible under there, so I'm really surprised
> that you're asking "why" tests are being added there.

The "Occam's razor" for deciding where tests belong should be does
the test need to change to respond to kernel change? Don't want to have
iproute2 tests that have if (kernel_version > ...)

Re: [PATCH net-next 0/1] Introduction of the tc tests

2017-06-19 Thread Stephen Hemminger

On Mon, 19 Jun 2017 23:48:19 -0400 (EDT)
David Miller  wrote:

> From: Cong Wang 
> Date: Mon, 19 Jun 2017 16:37:29 -0700
> 
> > Hi,
> > 
> > On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates  wrote:  
> >> Apologies for sending this as one big patch. I've been sitting on this a 
> >> little
> >> too long, but it's ready and I wanted to get it out.
> >>
> >> There are a limited number of tests to start - I plan to add more on a 
> >> regular
> >> basis.
> >>
> >> Lucas Bates (1):
> >>   selftests: Introduce tc testsuite  
> > 
> > Nice work!
> > 
> > Is there any particular reason you want to put these tests in kernel tree
> > especially tools/testing/selftests/ ?  
> 
> Yeah, it would be absolutely terrible if we had more tests in the
> kernel selftests area for networking.
> 
> More seriously, we need more, not less, tests in the kernel networking
> selftests directory.
> 
> It doesn't belong in iproute2 because we want a place to put things
> that automatically get tested when someone makes kernel changes and
> can be integrated into the kernel development workflow.
> 
> I want as many tests as possible under there, so I'm really surprised
> that you're asking "why" tests are being added there.

I agree these tests should be more about kernel behavior and updated when kernel
changes. Iproute2 has some outdated tests of its own, but these are more 
functional
tests for the command portion.

Re: [net,v2] ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf

2017-06-19 Thread Cong Wang

Hello,

On Mon, Jun 19, 2017 at 8:15 PM, jeffy  wrote:
> but actually they are not guaranteed to be paired:
>
> the netdev_run_todo(see the first dump stack above) would call
> netdev_wait_allrefs to rebroadcast unregister notification multiple times,
> unless timed out or all of the "struct net_device"'s refs released:
>
>  * This is called when unregistering network devices.
>  *
>  * Any protocol or device that holds a reference should register
>  * for netdevice notification, and cleanup and put back the
>  * reference if they receive an UNREGISTER event.
>  * We can get stuck here if buggy protocols don't correctly
>  * call dev_put.
>  */
> static void netdev_wait_allrefs(struct net_device *dev)
> {
> ...
> while (refcnt != 0) {
> if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
> rtnl_lock();
>
> /* Rebroadcast unregister notification */
> call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
>
> __rtnl_unlock();
> rcu_barrier();
> rtnl_lock();
>
>
> call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);

Interesting, I didn't notice this corner-case, because normally
we would hit the one in rollback_registered_many(). Probably
we need to add a check

if (dev->reg_state == NETREG_UNREGISTERING)

in ip6_route_dev_notify(). Can you give it a try?

I guess we probably need to revise other NETDEV_UNREGISTER
handlers too.

I will send a patch tomorrow.

Thanks!

Re: [PATCH net-next 0/1] Introduction of the tc tests

2017-06-19 Thread Cong Wang

On Mon, Jun 19, 2017 at 8:48 PM, David Miller  wrote:
> From: Cong Wang 
> Date: Mon, 19 Jun 2017 16:37:29 -0700
>
>> Hi,
>>
>> On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates  wrote:
>>> Apologies for sending this as one big patch. I've been sitting on this a 
>>> little
>>> too long, but it's ready and I wanted to get it out.
>>>
>>> There are a limited number of tests to start - I plan to add more on a 
>>> regular
>>> basis.
>>>
>>> Lucas Bates (1):
>>>   selftests: Introduce tc testsuite
>>
>> Nice work!
>>
>> Is there any particular reason you want to put these tests in kernel tree
>> especially tools/testing/selftests/ ?
>
> Yeah, it would be absolutely terrible if we had more tests in the
> kernel selftests area for networking.
>
> More seriously, we need more, not less, tests in the kernel networking
> selftests directory.
>
> It doesn't belong in iproute2 because we want a place to put things
> that automatically get tested when someone makes kernel changes and
> can be integrated into the kernel development workflow.
>
> I want as many tests as possible under there, so I'm really surprised
> that you're asking "why" tests are being added there.

I thought tools/testing/selftests/ is mainly for those tests close to
kernel ABI and API. What is the criteria for these tests? If any test
can fit in, we somehow would merge the whole LTP...

I definitely don't object more tests, I am just wondering if we should
put it to tools/testing/selftests/ or host it somewhere else.

Re: [PATCH 0/9] Bug fixes and ctr mode of operation

2017-06-19 Thread Herbert Xu

On Thu, Jun 15, 2017 at 12:43:38PM +0530, Harsh Jain wrote:
> This series is based on cryptodev2.6 tree and includes bug fix ,ctr(aes), 
> rfc3686(ctr(aes)) algo.
> 
> Harsh Jain (7):
>   crypto: chcr - Pass lcb bit setting to firmware
>   crypto: chcr - Set fallback key
>   crypto: chcr - Return correct error code
>   crypto: chcr - Avoid changing request structure
>   crypto:chcr - Add ctr mode and process large sg entries for cipher
>   MAINTAINERS:Add maintainer for chelsio crypto driver
>   crypto: chcr - Ensure Destination sg entry size less than  2k
> Atul Gupta (2):
>   chcr - Add debug counters
>   crypto: chcr - Select device in Round Robin fashion

All applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Re: [PATCH net-next 0/1] Introduction of the tc tests

2017-06-19 Thread David Miller

From: Cong Wang 
Date: Mon, 19 Jun 2017 16:37:29 -0700

> Hi,
> 
> On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates  wrote:
>> Apologies for sending this as one big patch. I've been sitting on this a 
>> little
>> too long, but it's ready and I wanted to get it out.
>>
>> There are a limited number of tests to start - I plan to add more on a 
>> regular
>> basis.
>>
>> Lucas Bates (1):
>>   selftests: Introduce tc testsuite
> 
> Nice work!
> 
> Is there any particular reason you want to put these tests in kernel tree
> especially tools/testing/selftests/ ?

Yeah, it would be absolutely terrible if we had more tests in the
kernel selftests area for networking.

More seriously, we need more, not less, tests in the kernel networking
selftests directory.

It doesn't belong in iproute2 because we want a place to put things
that automatically get tested when someone makes kernel changes and
can be integrated into the kernel development workflow.

I want as many tests as possible under there, so I'm really surprised
that you're asking "why" tests are being added there.

Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test

2017-06-19 Thread l00371289

Hi, Andrew

On 2017/6/20 5:54, Andrew Lunn wrote:
> On Mon, Jun 19, 2017 at 02:00:43PM -0700, Florian Fainelli wrote:
>> On 06/16/2017 02:24 AM, Lin Yun Sheng wrote:
>>> This patch fixes the phy loopback self_test failed issue. when
>>> Marvell Phy Module is loaded, it will powerdown fiber when doing
>>> phy loopback self test, which cause phy loopback self_test fail.
>>>
>>> Signed-off-by: Lin Yun Sheng 
>>> ---
>>>  drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++--
>>>  1 file changed, 14 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
>>> b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>>> index b8fab14..e95795b 100644
>>> --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>>> +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>>> @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct 
>>> phy_device *phy_dev, u8 en)
>>
>> The question really is, why is not this properly integrated into the PHY
>> driver and PHYLIB such that the only thing the Ethernet MAC driver has
>> to call is a function of the PHY driver putting it in self-test?
> 
> This whole driver pokes various PHY registers, rather than use
> phylib. And it does so without taking the PHY lock. 
I will consider using phylib as much as possible, thanks.

It also assumes it
> is a Marvell PHY and i don't see anywhere it actually verifies this.
When it said Marvell Phy , I meant Marvell Phy with fibre support.
I will send another patch to only set the bit in Fiber Control when
it is a Marvell Phy with fibre support.

Thanks for reply.
Best Regards
Yunsheng Lin
> 
> This is all broken.
> 
>  Andrew
> 
> .
>

Re: [net,v2] ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf

2017-06-19 Thread jeffy


Hi guys,

i hit some warnings when testing this patch on my local 4.4 kernel(arm64 
chromebook) with KASAN & SLUB_DEBUG:


[9.919374] BUG: KASAN: use-after-free in 
ip6_route_dev_notify+0x194/0x2bc at addr ffc0c9d4e480

[9.928469] Read of size 4 by task kworker/u12:3/124
[9.933463] 
=

[9.941686] BUG kmalloc-1024 (Not tainted): kasan: bad access detected
...
[   10.741337] Workqueue: netns cleanup_net
[   10.745300] Call trace:
[   10.747776] [] dump_backtrace+0x0/0x200
[   10.753203] [] show_stack+0x24/0x30
[   10.758284] [] dump_stack+0xa8/0xcc
[   10.763364] [] print_trailer+0x158/0x168
[   10.768877] [] object_err+0x4c/0x5c
[   10.773956] [] kasan_report+0x338/0x4ec
[   10.779383] [] __asan_load4+0x7c/0x84
[   10.784640] [] ip6_route_dev_notify+0x194/0x2bc
[   10.790763] [] notifier_call_chain+0x78/0xc0
[   10.796625] [] raw_notifier_call_chain+0x3c/0x4c
[   10.802835] [] call_netdevice_notifiers_info+0x8c/0x9c
[   10.809564] [] call_netdevice_notifiers+0x9c/0xcc
[   10.815859] [] netdev_run_todo+0x224/0x3f0
[   10.821547] [] rtnl_unlock+0x14/0x1c
[   10.826716] [] default_device_exit_batch+0x258/0x2a0
[   10.833269] [] ops_exit_list+0x74/0xdc
[   10.838608] [] cleanup_net+0x290/0x400


and also this:
[   11.607268] BUG kmalloc-1024 (Tainted: GB  ): Poison 
overwritten
[   11.607270] 
-
[   11.607274] INFO: 0xffc0c9d4e478-0xffc0c9d4e478. First byte 
0x67 instead of 0x6b

...
[   11.607679] [] print_trailer+0x158/0x168
[   11.607683] [] check_bytes_and_report+0xd8/0x13c
[   11.607688] [] check_object+0x134/0x230
[   11.607692] [] alloc_debug_processing+0x104/0x178
[   11.607697] [] ___slab_alloc.constprop.26+0x2ec/0x434
[   11.607702] [] 
__slab_alloc.isra.23.constprop.25+0x48/0x5c

[   11.607707] [] __kmalloc_track_caller+0x12c/0x338



it looks like the "struct inet6_dev" been touched after freed, and 
refcnt changed(0xffc0c9d4e478, 376 bytes of struct inet6_dev) when 
reusing this memory.





i think the problem would be we are assuming NETDEV_REGISTER and 
NETDEV_UNREGISTER be paired in ip6_route_dev_notify:


> +  if (event == NETDEV_REGISTER) {
>net->ipv6.ip6_null_entry->dst.dev = dev;
>net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
>   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
> @@ -3718,6 +3721,12 @@ static int ip6_route_dev_notify(struct 
notifier_block *this,

>net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
>net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
>   #endif
> +   } else if (event == NETDEV_UNREGISTER) {
> +  in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
> +#ifdef CONFIG_IPV6_MULTIPLE_TABLES
> +  in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
> +  in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
> +#endif
>}

but actually they are not guaranteed to be paired:

the netdev_run_todo(see the first dump stack above) would call 
netdev_wait_allrefs to rebroadcast unregister notification multiple 
times, unless timed out or all of the "struct net_device"'s refs released:


 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
...
while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();

/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

__rtnl_unlock();
rcu_barrier();
rtnl_lock();


call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);



On 05/05/2017 01:36 AM, WANG Cong wrote:

For each netns (except init_net), we initialize its null entry
in 3 places:

1) The template itself, as we use kmemdup()
2) Code around dst_init_metrics() in ip6_route_net_init()
3) ip6_route_dev_notify(), which is supposed to initialize it after
loopback registers

Unfortunately the last one still happens in a wrong order because
we expect to initialize net->ipv6.ip6_null_entry->rt6i_idev to
net->loopback_dev's idev, so we have to do that after we add
idev to it. However, this notifier has priority == 0 same as
ipv6_dev_notf, and ipv6_dev_notf is registered after
ip6_route_dev_notifier so it is called actually after
ip6_route_dev_notifier.

Fix it by picking a smaller priority for ip6_route_dev_notifier.
Also, we have to release the refcnt accordingly when unregistering
loopback_dev because device exit functions are called before subsys
exit functions.

Cc: David

Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test

2017-06-19 Thread l00371289

hi, Florian

On 2017/6/20 5:00, Florian Fainelli wrote:
> On 06/16/2017 02:24 AM, Lin Yun Sheng wrote:
>> This patch fixes the phy loopback self_test failed issue. when
>> Marvell Phy Module is loaded, it will powerdown fiber when doing
>> phy loopback self test, which cause phy loopback self_test fail.
>>
>> Signed-off-by: Lin Yun Sheng 
>> ---
>>  drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++--
>>  1 file changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
>> b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>> index b8fab14..e95795b 100644
>> --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>> +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
>> @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct 
>> phy_device *phy_dev, u8 en)
> 
> The question really is, why is not this properly integrated into the PHY
> driver and PHYLIB such that the only thing the Ethernet MAC driver has
> to call is a function of the PHY driver putting it in self-test?
Do you mean calling the phy_dev->drv->resume and phy_dev->drv->suspend functions?
I tried that, but it failed. If that is what you mean, I will look into why it 
fails.

Thanks for your reply.

Best regards
YunSheng Lin
> 
>>  
>>  /* Force 1000M Link, Default is 0x0200 */
>>  phy_write(phy_dev, 7, 0x20C);
>> -phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>>  
>> -/* Enable PHY loop-back */
>> +/* Powerup Fiber */
>> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
>> +val = phy_read(phy_dev, COPPER_CONTROL_REG);
>> +val &= ~PHY_POWER_DOWN;
>> +phy_write(phy_dev, COPPER_CONTROL_REG, val);
>> +
>> +/* Enable Phy Loopback */
>> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>>  val = phy_read(phy_dev, COPPER_CONTROL_REG);
>>  val |= PHY_LOOP_BACK;
>>  val &= ~PHY_POWER_DOWN;
>> @@ -299,6 +305,12 @@ static int hns_nic_config_phy_loopback(struct 
>> phy_device *phy_dev, u8 en)
>>  phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA);
>>  phy_write(phy_dev, 1, 0x400);
>>  phy_write(phy_dev, 7, 0x200);
>> +
>> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
>> +val = phy_read(phy_dev, COPPER_CONTROL_REG);
>> +val |= PHY_POWER_DOWN;
>> +phy_write(phy_dev, COPPER_CONTROL_REG, val);
>> +
>>  phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>>  phy_write(phy_dev, 9, 0xF00);
>>  
>>
> 
>

[PATCH net-next v3 07/15] bpf: Add setsockopt helper function to bpf

2017-06-19 Thread Lawrence Brakmo

Added support for calling a subset of socket setsockopts from
BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather
than making the changes to call the socket setsockopt function because
the changes required would have been larger.

The ops supported are:
  SO_RCVBUF
  SO_SNDBUF
  SO_MAX_PACING_RATE
  SO_PRIORITY
  SO_RCVLOWAT
  SO_MARK

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h  | 14 -
 net/core/filter.c | 77 ++-
 samples/bpf/bpf_helpers.h |  3 ++
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 314fdf3..86595f9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -520,6 +520,17 @@ union bpf_attr {
  * Set full skb->hash.
  * @skb: pointer to skb
  * @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ * Calls setsockopt. Not all opts are available, only those with
+ * integer optvals plus TCP_CONGESTION.
+ * Supported levels: SOL_SOCKET and IPROTO_TCP
+ * @bpf_socket: pointer to bpf_socket
+ * @level: SOL_SOCKET or IPROTO_TCP
+ * @optname: option name
+ * @optval: pointer to option value
+ * @optlen: length of optval in byes
+ * Return: 0 or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -570,7 +581,8 @@ union bpf_attr {
FN(probe_read_str), \
FN(get_socket_cookie),  \
FN(get_socket_uid), \
-   FN(set_hash),
+   FN(set_hash),   \
+   FN(setsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 7d69d16..b114ae1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * sk_filter_trim_cap - run a packet through a socket filter
@@ -2671,6 +2672,69 @@ static const struct bpf_func_proto 
bpf_get_socket_uid_proto = {
.arg1_type  = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+  int, level, int, optname, char *, optval, int, optlen)
+{
+   struct sock *sk = bpf_sock->sk;
+   int ret = 0;
+   int val;
+
+   if (bpf_sock->is_req_sock)
+   return -EINVAL;
+
+   if (level == SOL_SOCKET) {
+   /* Only some socketops are supported */
+   val = *((int *)optval);
+
+   switch (optname) {
+   case SO_RCVBUF:
+   sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+   sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+   break;
+   case SO_SNDBUF:
+   sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+   sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+   break;
+   case SO_MAX_PACING_RATE:
+   sk->sk_max_pacing_rate = val;
+   sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+sk->sk_max_pacing_rate);
+   break;
+   case SO_PRIORITY:
+   sk->sk_priority = val;
+   break;
+   case SO_RCVLOWAT:
+   if (val < 0)
+   val = INT_MAX;
+   sk->sk_rcvlowat = val ? : 1;
+   break;
+   case SO_MARK:
+   sk->sk_mark = val;
+   break;
+   default:
+   ret = -EINVAL;
+   }
+   } else if (level == SOL_TCP &&
+  sk->sk_prot->setsockopt == tcp_setsockopt) {
+   /* Place holder */
+   ret = -EINVAL;
+   } else {
+   ret = -EINVAL;
+   }
+   return ret;
+}
+
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+   .func   = bpf_setsockopt,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_ANYTHING,
+   .arg3_type  = ARG_ANYTHING,
+   .arg4_type  = ARG_PTR_TO_MEM,
+   .arg5_type  = ARG_CONST_SIZE_OR_ZERO,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -2822,6 +2886,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+   sock_ops_func_proto(enum bpf_func_id func_id)
+{
+   switch (func_id) {
+   case BPF_FUNC_setsockopt:
+   return _setsockopt_proto;
+   default:
+   return bpf_base_func_proto(func_id);
+   }
+}
+
+static const struct bpf_func_proto *

[PATCH net-next v3 11/15] bpf: Sample BPF program to set congestion control

2017-06-19 Thread Lawrence Brakmo

Sample BPF program that sets congestion control to dctcp when both hosts
are within the same datacenter. In this example that is assumed to be
the case when the first 5.5 bytes of their IPv6 addresses are the same.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_cong_kern.c | 73 +
 2 files changed, 74 insertions(+)
 create mode 100644 samples/bpf/tcp_cong_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 942c7c7..eb324e0 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -116,6 +116,7 @@ always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
+always += tcp_cong_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000..d56fb8a
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,73 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set congestion control to dctcp when both hosts are
+ * in the same datacenter (as deteremined by IPv6 prefix).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_cong(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   char cong[] = "dctcp";
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check if both hosts are in the same datacenter. For this
+* example they are if the 1st 5.5 bytes in the IPv6 address
+* are the same.
+*/
+   if (skops->family == AF_INET6 &&
+   skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0)) {
+   switch (op) {
+   case BPF_SOCK_OPS_NEEDS_ECN:
+   rv = 1;
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+   cong, sizeof(cong));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+   cong, sizeof(cong));
+   break;
+   default:
+   rv = -1;
+   }
+   } else {
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

[PATCH net-next v3 15/15] bpf: Sample bpf program to set sndcwnd clamp

2017-06-19 Thread Lawrence Brakmo

Sample BPF program, tcp_clamp_kern.c, to demonstrate the use
of setting the sndcwnd clamp. This program assumes that if the
first 5.5 bytes of the host's IPv6 addresses are the same, then
the hosts are in the same datacenter and sets sndcwnd clamp to
100 packets, SYN and SYN-ACK RTOs to 10ms and send/receive buffer
sizes to 150KB.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile |  1 +
 samples/bpf/tcp_clamp_kern.c | 93 
 2 files changed, 94 insertions(+)
 create mode 100644 samples/bpf/tcp_clamp_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3ec96a0..59975c3 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -118,6 +118,7 @@ always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
 always += tcp_cong_kern.o
 always += tcp_iw_kern.o
+always += tcp_clamp_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000..413eeba
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,93 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
+ * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
+ * the same datacenter. For his example, we assume they are within the same
+ * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_clamp(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 15;
+   int to_init = 10;
+   int clamp = 100;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check that both hosts are within same datacenter. For this example
+* it is the case when the first 5.5 bytes of their IPv6 addresses are
+* the same.
+*/
+   if (skops->family == AF_INET6 &&
+   skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0)) {
+   switch (op) {
+   case BPF_SOCK_OPS_TIMEOUT_INIT:
+   rv = to_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
+   , sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, ,
+ sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP,
+   TCP_BPF_SNDCWND_CLAMP,
+   , sizeof(clamp));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_TCP,
+   TCP_BPF_SNDCWND_CLAMP,
+   , sizeof(clamp));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_SNDBUF, ,
+ sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, ,
+ sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+   } else {
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

[PATCH net-next v3 10/15] bpf: Add support for changing congestion control

2017-06-19 Thread Lawrence Brakmo

Added support for changing congestion control for SOCK_OPS bpf
programs through the setsockopt bpf helper function. It also adds
a new SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, that is needed for
congestion controls, like dctcp, that need to enable ECN in the
SYN packets.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h|  9 -
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c| 11 +--
 net/ipv4/tcp.c   |  2 +-
 net/ipv4/tcp_cong.c  | 32 ++--
 net/ipv4/tcp_input.c |  3 ++-
 net/ipv4/tcp_output.c|  8 +---
 7 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ff806d7..58d67be 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1003,7 +1003,9 @@ void tcp_get_default_congestion_control(char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
+void tcp_reinit_congestion_control(struct sock *sk,
+  const struct tcp_congestion_ops *ca);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
@@ -2072,4 +2074,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool 
is_req_sock)
rwnd = 0;
return rwnd;
 }
+
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+   return (tcp_call_bpf(sk, true, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+}
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4856d16..c222059 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -776,6 +776,9 @@ enum {
 * passive connection is
 * established
 */
+   BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+* needs ECN
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index b114ae1..bbf8f78 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2716,8 +2716,15 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
}
} else if (level == SOL_TCP &&
   sk->sk_prot->setsockopt == tcp_setsockopt) {
-   /* Place holder */
-   ret = -EINVAL;
+   if (optname == TCP_CONGESTION) {
+   ret = tcp_set_congestion_control(sk, optval, false);
+   if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+   /* replacing an existing ca */
+   tcp_reinit_congestion_control(sk,
+   inet_csk(sk)->icsk_ca_ops);
+   } else {
+   ret = -EINVAL;
+   }
} else {
ret = -EINVAL;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 058f509..9476fd6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2479,7 +2479,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
 
lock_sock(sk);
-   err = tcp_set_congestion_control(sk, name);
+   err = tcp_set_congestion_control(sk, name, true);
release_sock(sk);
return err;
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 324c9bc..fde983f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_dontxmit(sk);
 }
 
-static void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca)
+void tcp_reinit_congestion_control(struct sock *sk,
+  const struct tcp_congestion_ops *ca)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -333,8 +333,12 @@ int tcp_set_allowed_congestion_control(char *val)
return ret;
 }
 
-/* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized.
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -344,21 +348,29 @@ int

[PATCH net-next v3 13/15] bpf: Sample BPF program to set initial cwnd

2017-06-19 Thread Lawrence Brakmo

Sample BPF program that assumes hosts are far away (i.e. large RTTs)
and sets initial cwnd and initial receive window to 40 packets,
send and receive buffers to 1.5MB.

In practice there would be a test to ensure the hosts are actually
far enough away.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile  |  1 +
 samples/bpf/tcp_iw_kern.c | 78 +++
 2 files changed, 79 insertions(+)
 create mode 100644 samples/bpf/tcp_iw_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index eb324e0..3ec96a0 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -117,6 +117,7 @@ always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
 always += tcp_cong_kern.o
+always += tcp_iw_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000..4f978fc
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial congestion window and initial receive
+ * window to 40 packets and send and receive buffers to 1.5MB. This
+ * would usually be done after doing appropriate checks that indicate
+ * the hosts are far enough away (i.e. large RTT).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_iw(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 150;
+   int rwnd_init = 40;
+   int iw = 40;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Usually there would be a check to insure the hosts are far
+* from each other so it makes sense to increase buffer sizes
+*/
+   switch (op) {
+   case BPF_SOCK_OPS_RWND_INIT:
+   rv = rwnd_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, ,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+, sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, ,
+   sizeof(iw));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, ,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+, sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

[PATCH net-next v3 14/15] bpf: Adds support for setting sndcwnd clamp

2017-06-19 Thread Lawrence Brakmo

Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_SNDCWND_CLAMP, which
sets the send congestion window clamp. It is useful to limit the sndcwnd
when the hosts are close to each other (small RTT).

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c| 7 +++
 2 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a07acc6..47189e5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -782,5 +782,6 @@ enum {
 };
 
 #define TCP_BPF_IW 1001/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index db6d30c0..664bb9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2733,6 +2733,13 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
else
tp->snd_cwnd = val;
break;
+   case TCP_BPF_SNDCWND_CLAMP:
+   if (val <= 0) {
+   ret = -EINVAL;
+   } else {
+   tp->snd_cwnd_clamp = val;
+   tp->snd_ssthresh = val;
+   }
default:
ret = -EINVAL;
}
-- 
2.9.3

[PATCH net-next v3 01/15] bpf: BPF support for sock_ops

2017-06-19 Thread Lawrence Brakmo

Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). Currently there is
functionality to load one global BPF program of this type which can be
called at appropriate times to set relevant connection parameters such
as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection
information such as IP addresses, port numbers, etc.

Although there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it does not require
application changes and it can be updated easily at any time.

I plan to add support for loading per cgroup sock_ops BPF programs in
the near future. One question is whether I should add this functionality
into David Ahern's BPF_PROG_TYPE_CGROUP_SOCK or create a new cgroup bpf
type. Whereas the current cgroup_sock type expects to be called only once
during a connection's lifetime, the new sock_ops type could be called
multiple times. For example, before sending SYN and SYN-ACKs to set an
appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.

The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.

This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.

This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.

Two new corresponding structs (one for the kernel one for the user/BPF
program):

/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
bool   is_req_sock:1;
__u32  op;
union {
__u32 reply;
__u32 replylong[4];
};
};

/* user version */
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4;
__u32 local_ip4;
__u32 remote_ip6[4];
__u32 local_ip6[4];
__u32 remote_port;
__u32 local_port;
};

Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.

The reply fields of the bpf_sock_ops struct are there in case a bpf
program needs to return a value larger than an integer.

Signed-off-by: Lawrence Brakmo 
---
 include/linux/bpf.h   |   6 ++
 include/linux/bpf_types.h |   1 +
 include/linux/filter.h|  10 +++
 include/net/tcp.h |  30 
 include/uapi/linux/bpf.h  |  28 
 kernel/bpf/syscall.c  |  62 +
 net/core/Makefile |   3 +-
 net/core/filter.c | 170 ++
 net/core/sock_bpfops.c|  65 ++
 samples/bpf/bpf_load.c|  13 +++-
 10 files changed, 370 insertions(+), 18 deletions(-)
 create mode 100644 net/core/sock_bpfops.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1bcbf0a..a1a1f2f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -362,4 +362,10 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+/* sock_ops related */
+struct bpf_sock_ops_kern;
+
+int bpf_sock_ops_attach_global_prog(int fd);
+int bpf_sock_ops_detach_global_prog(void);
+int bpf_sock_ops_call(struct bpf_sock_ops_kern *bpf_sock);
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 03bf223..3d137c3 100644
--- a/include/linux/bpf_types.h
+++

[PATCH net-next v3 08/15] bpf: Add TCP connection BPF callbacks

2017-06-19 Thread Lawrence Brakmo

Added callbacks to BPF SOCK_OPS type program before an active
connection is initialized and after a passive or active connection is
established.

The following patch demonstrates how they can be used to set send and
receive buffer sizes.

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h | 11 +++
 net/ipv4/tcp_fastopen.c  |  1 +
 net/ipv4/tcp_input.c |  4 +++-
 net/ipv4/tcp_output.c|  1 +
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 86595f9..4856d16 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -765,6 +765,17 @@ enum {
 * window (in packets) or -1 if default
 * value should be used
 */
+   BPF_SOCK_OPS_TCP_CONNECT_CB,/* Calls BPF program right before an
+* active connection is initialized
+*/
+   BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+* active connection is
+* established
+*/
+   BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,/* Calls BPF program when a
+* passive connection is
+* established
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4af82b9..ed6b549 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -221,6 +221,7 @@ static struct sock *tcp_fastopen_create_child(struct sock 
*sk,
tcp_init_congestion_control(child);
tcp_mtup_init(child);
tcp_init_metrics(child);
+   tcp_call_bpf(child, false, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_buffer_space(child);
 
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0867b05..1b868ae 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff 
*skb)
icsk->icsk_af_ops->rebuild_header(sk);
 
tcp_init_metrics(sk);
-
+   tcp_call_bpf(sk, false, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
 
/* Prevent spurious tcp_cwnd_restart() on first data
@@ -5977,6 +5977,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
} else {
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
+   tcp_call_bpf(sk, false,
+BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
 
tcp_mtup_init(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e5f623f..958edc8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3445,6 +3445,7 @@ int tcp_connect(struct sock *sk)
struct sk_buff *buff;
int err;
 
+   tcp_call_bpf(sk, false, BPF_SOCK_OPS_TCP_CONNECT_CB);
tcp_connect_init(sk);
 
if (unlikely(tp->repair)) {
-- 
2.9.3

[PATCH net-next v3 12/15] bpf: Adds support for setting initial cwnd

2017-06-19 Thread Lawrence Brakmo

Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_IW, which sets the
initial congestion window. This can be used when the hosts are far
apart (large RTTs) and it is safe to start with a large initial cwnd.

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c| 14 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c222059..a07acc6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -781,4 +781,6 @@ enum {
 */
 };
 
+#define TCP_BPF_IW 1001/* Set TCP initial congestion window */
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index bbf8f78..db6d30c0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2723,7 +2723,19 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
tcp_reinit_congestion_control(sk,
inet_csk(sk)->icsk_ca_ops);
} else {
-   ret = -EINVAL;
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   val = *((int *)optval);
+   switch (optname) {
+   case TCP_BPF_IW:
+   if (val <= 0 || tp->data_segs_out > 0)
+   ret = -EINVAL;
+   else
+   tp->snd_cwnd = val;
+   break;
+   default:
+   ret = -EINVAL;
+   }
}
} else {
ret = -EINVAL;
-- 
2.9.3

[PATCH net-next v3 03/15] bpf: Support for per connection SYN/SYN-ACK RTOs

2017-06-19 Thread Lawrence Brakmo

This patch adds support for setting a per connection SYN and
SYN_ACK RTOs from within a BPF_SOCK_OPS program. For example,
to set small RTOs when it is known both hosts are within a
datacenter.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h| 11 +++
 include/uapi/linux/bpf.h |  3 +++
 net/ipv4/tcp_input.c |  3 ++-
 net/ipv4/tcp_output.c|  2 +-
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6f415c..bdf6bfd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2051,4 +2051,15 @@ static inline int tcp_call_bpf(struct sock *sk, bool 
is_req_sock, int op)
 }
 #endif
 
+static inline u32 tcp_timeout_init(struct sock *sk, bool is_req_sock)
+{
+   int timeout;
+
+   timeout = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_TIMEOUT_INIT);
+
+   if (timeout <= 0)
+   timeout = TCP_TIMEOUT_INIT;
+   return timeout;
+}
+
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 861dbe9..4532c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -746,6 +746,9 @@ struct bpf_sock_ops {
  */
 enum {
BPF_SOCK_OPS_VOID,
+   BPF_SOCK_OPS_TIMEOUT_INIT,  /* Should return SYN-RTO value to use or
+* -1 if default value should be used
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ab7e2f..0867b05 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6406,7 +6406,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
} else {
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
-   inet_csk_reqsk_queue_hash_add(sk, req, 
TCP_TIMEOUT_INIT);
+   inet_csk_reqsk_queue_hash_add(sk, req,
+   tcp_timeout_init((struct sock *)req, true));
af_ops->send_synack(sk, dst, , req, ,
!want_cookie ? TCP_SYNACK_NORMAL :
   TCP_SYNACK_COOKIE);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a9c395..5e478a1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3327,7 +3327,7 @@ static void tcp_connect_init(struct sock *sk)
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
 
-   inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+   inet_csk(sk)->icsk_rto = tcp_timeout_init(sk, false);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
 }
-- 
2.9.3

[PATCH net-next v3 05/15] bpf: Support for setting initial receive window

2017-06-19 Thread Lawrence Brakmo

This patch adds support for setting the initial advertized window from
within a BPF_SOCK_OPS program. This can be used to support larger
initial cwnd values in environments where it is known to be safe.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h| 10 ++
 include/uapi/linux/bpf.h |  4 
 net/ipv4/tcp_minisocks.c |  9 -
 net/ipv4/tcp_output.c|  7 ++-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index bdf6bfd..ff806d7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2062,4 +2062,14 @@ static inline u32 tcp_timeout_init(struct sock *sk, bool 
is_req_sock)
return timeout;
 }
 
+static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock)
+{
+   int rwnd;
+
+   rwnd = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_RWND_INIT);
+
+   if (rwnd < 0)
+   rwnd = 0;
+   return rwnd;
+}
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4532c31..314fdf3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -749,6 +749,10 @@ enum {
BPF_SOCK_OPS_TIMEOUT_INIT,  /* Should return SYN-RTO value to use or
 * -1 if default value should be used
 */
+   BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
+* window (in packets) or -1 if default
+* value should be used
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d30ee31..bbaf3c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
int full_space = tcp_full_space(sk_listener);
u32 window_clamp;
__u8 rcv_wscale;
+   u32 rcv_wnd;
int mss;
 
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
 
+   rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req, true);
+   if (rcv_wnd == 0)
+   rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+   else if (full_space < rcv_wnd * mss)
+   full_space = rcv_wnd * mss;
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
>rsk_window_clamp,
ireq->wscale_ok,
_wscale,
-   dst_metric(dst, RTAX_INITRWND));
+   rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5e478a1..e5f623f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,6 +3267,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+   u32 rcv_wnd;
 
/* We'll fix this up when we get a response from the other end.
 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3300,13 +3301,17 @@ static void tcp_connect_init(struct sock *sk)
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
tp->window_clamp = tcp_full_space(sk);
 
+   rcv_wnd = tcp_rwnd_init_bpf(sk, false);
+   if (rcv_wnd == 0)
+   rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
tcp_select_initial_window(tcp_full_space(sk),
  tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0),
  >rcv_wnd,
  >window_clamp,
  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
  _wscale,
- dst_metric(dst, RTAX_INITRWND));
+ rcv_wnd);
 
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
-- 
2.9.3

PATCH net-next v3 00/15

2017-06-19 Thread Lawrence Brakmo

Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.) and setting
connection parameters such as buffer sizes, initial window, SYN/SYN-ACK
RTOs, etc.

Unlike current BPF program types that expect to be called at a particular
place in the network stack code, SOCK_OPS program can be called at
different places and use an "op" field to indicate the context. There
are currently two types of operations, those whose effect is through
their return value and those whose effect is through the new
bpf_setsockopt BPF helper function.

Example operands of the first type are:
  BPF_SOCK_OPS_TIMEOUT_INIT
  BPF_SOCK_OPS_RWND_INIT
  BPF_SOCK_OPS_NEEDS_ECN

Example operands of the second type are:
  BPF_SOCK_OPS_TCP_CONNECT_CB
  BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB
  BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB

Current operands are only called during connection establishment so
there should not be any BPF overheads after connection establishment. The
main idea is to use connection information from both hosts, such as IP
addresses and ports, to allow setting of per connection parameters to
optimize the connection's performance.

Although there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it does not require
application changes and it can be updated easily at any time.

Currently there is functionality to load one global BPF program of this
type but I plan to add support for loading per cgroup socket ops BPF
programs in the near future. When that is done, the global program could
be called when a cgroup has no program associated with it.

One question is whether I should add this functionality into David Ahern's
BPF_PROG_TYPE_CGROUP_SOCK or create a new cgroup bpf type. Whereas the
current cgroup_sock type expects to be called only once during a connection's
lifetime, the new socket_ops type could be called multiple times. My preference
is to define a new bpf attach type, BPF_CGROUP_SOCK_OPS, to attach
BPF_PROG_TYPE_SOCK_OPS to cgroups.

This patch set also includes sample BPF programs to demonstrate the different
features.

v2: Formatting changes, rebased to latest net-next

v3: Fixed build issues, changed socket_ops to sock_ops throughout,
fixed formatting issues, removed the syscall to load sock_ops
program and added functionality to use existing bpf attach and
bpf detach system calls, removed reader/writer locks in
sock_bpfops.c (used when saving sock_ops global program)

Consists of the following patches:


 include/linux/bpf.h   |   6 ++
 include/linux/bpf_types.h |   1 +
 include/linux/filter.h|  10 ++
 include/net/tcp.h |  60 ++-
 include/uapi/linux/bpf.h  |  66 +++-
 kernel/bpf/syscall.c  |  62 +---
 net/core/Makefile |   3 +-
 net/core/filter.c | 271 
++
 net/core/sock_bpfops.c|  65 
 net/ipv4/tcp.c|   2 +-
 net/ipv4/tcp_cong.c   |  32 --
 net/ipv4/tcp_fastopen.c   |   1 +
 net/ipv4/tcp_input.c  |  10 +-
 net/ipv4/tcp_minisocks.c  |   9 +-
 net/ipv4/tcp_output.c |  18 +++-
 samples/bpf/Makefile  |   9 ++
 samples/bpf/bpf_helpers.h |   3 +
 samples/bpf/bpf_load.c|  13 ++-
 samples/bpf/tcp_bpf.c |  86 
 samples/bpf/tcp_bufs_kern.c   |  76 ++
 samples/bpf/tcp_clamp_kern.c  |  93 +
 samples/bpf/tcp_cong_kern.c   |  73 ++
 samples/bpf/tcp_iw_kern.c |  78 +++
 samples/bpf/tcp_rwnd_kern.c   |  60 +++
 samples/bpf/tcp_synrto_kern.c |  59 +++
 25 files changed, 1126 insertions(+), 40 deletions(-)

[PATCH net-next v3 06/15] bpf: Sample bpf program to set initial window

2017-06-19 Thread Lawrence Brakmo

The sample bpf program, tcp_rwnd_kern.c, sets the initial
advertized window to 40 packets in an environment where
distinct IPv6 prefixes indicate that both hosts are not
in the same data center.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_rwnd_kern.c | 60 +
 2 files changed, 61 insertions(+)
 create mode 100644 samples/bpf/tcp_rwnd_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 21cb016..9aca209 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -114,6 +114,7 @@ always += xdp_tx_iptunnel_kern.o
 always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
+always += tcp_rwnd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000..26c1370
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,60 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets when using IPv6
+ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
+ * example that means both hosts are not the same datacenter.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_rwnd(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int rv = -1;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check for RWND_INIT operation and IPv6 addresses */
+   if (op == BPF_SOCK_OPS_RWND_INIT &&
+   skops->family == AF_INET6) {
+
+   /* If the first 5.5 bytes of the IPv6 address are not the same
+* then both hosts are not in the same datacenter
+* so use a larger initial advertized window (40 packets)
+*/
+   if (skops->local_ip6[0] != skops->remote_ip6[0] ||
+   (skops->local_ip6[1] & 0xf000) !=
+   (skops->remote_ip6[1] & 0xf000))
+   bpf_trace_printk(fmt2, sizeof(fmt2), -1);
+   rv = 40;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

[PATCH net-next v3 09/15] bpf: Sample BPF program to set buffer sizes

2017-06-19 Thread Lawrence Brakmo

This patch contains a BPF program to set initial receive window to
40 packets and send and receive buffers to 1.5MB. This would usually
be done after doing appropriate checks that indicate the hosts are
far enough away (i.e. large RTT).

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_bufs_kern.c | 76 +
 2 files changed, 77 insertions(+)
 create mode 100644 samples/bpf/tcp_bufs_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 9aca209..942c7c7 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -115,6 +115,7 @@ always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
+always += tcp_bufs_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000..6cc096c
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,76 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets and send
+ * and receive buffers to 1.5MB. This would usually be done after
+ * doing appropriate checks that indicate the hosts are far enough
+ * away (i.e. large RTT).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_bufs(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 150;
+   int rwnd_init = 40;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Usually there would be a check to insure the hosts are far
+* from each other so it makes sense to increase buffer sizes
+*/
+   switch (op) {
+   case BPF_SOCK_OPS_RWND_INIT:
+   rv = rwnd_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, ,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+, sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   /* Nothing to do */
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, ,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+, sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

[PATCH net-next v3 02/15] bpf: program to load sock_ops BPF programs

2017-06-19 Thread Lawrence Brakmo

The program tcp_bpf can be used to remove current global sock_ops program
and to load/replace sock_ops BPF programs. There is also an option to
print the bpf trace buffer (for debugging purposes).

USAGE:
  ./tcp_bpf [-r] [-l] [<prog>]
WHERE:
  -r      remove current loaded sock_ops BPF program
          not needed if loading a new program
  -l      print BPF trace buffer. Used when loading a new program
  <prog>  name of BPF sock_ops program to load
          if <prog> does not end in ".o", then "_kern.o" is appended
          example: using tcp_rto will load tcp_rto_kern.o

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile  |  3 ++
 samples/bpf/tcp_bpf.c | 86 +++
 2 files changed, 89 insertions(+)
 create mode 100644 samples/bpf/tcp_bpf.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a0561dc..ed6bc75 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -36,6 +36,7 @@ hostprogs-y += lwt_len_hist
 hostprogs-y += xdp_tx_iptunnel
 hostprogs-y += test_map_in_map
 hostprogs-y += per_socket_stats_example
+hostprogs-y += tcp_bpf
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o
@@ -52,6 +53,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tcp_bpf-objs := bpf_load.o $(LIBBPF) tcp_bpf.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
@@ -130,6 +132,7 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
+HOSTLOADLIBES_tcp_bpf += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
diff --git a/samples/bpf/tcp_bpf.c b/samples/bpf/tcp_bpf.c
new file mode 100644
index 000..735b8b2
--- /dev/null
+++ b/samples/bpf/tcp_bpf.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+#include 
+#include 
+#include 
+
+static void usage(char *pname)
+{
+   printf("USAGE:\n  %s [-r] [-l] \n", pname);
+   printf("WHERE:\n");
+   printf("  -r  remove current loaded socketops BPF program\n");
+   printf("  not needed if loading a new program\n");
+   printf("  -l  print out BPF log buffer\n");
+   printf("   name of BPF sockeops program to load\n");
+   printf("  if  does not end in \".o\", then \"_kern.o\" "
+  "is appended\n");
+   printf("  example: using tcp1 will load tcp1_kern.o\n");
+   printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+   //union bpf_attr attr;
+   int k, logFlag = 0;
+   int error = 0;
+   char fn[500];
+
+   if (argc <= 1)
+   usage(argv[0]);
+   for (k = 1; k < argc; k++) {
+   if (!strcmp(argv[k], "-r")) {
+   error = bpf_prog_detach(0, BPF_GLOBAL_SOCK_OPS);
+   if (error) {
+   printf("ERROR: bpf_prog_detach: %d (%s)\n",
+  error, strerror(errno));
+   error =  1;
+   }
+   } else if (!strcmp(argv[k], "-l")) {
+   logFlag = 1;
+   } else if (!strcmp(argv[k], "-h")) {
+   usage(argv[0]);
+   } else if (argv[k][0] == '-') {
+   printf("Error, unknown flag: %s\n", argv[k]);
+   error = 2;
+   } else if (strlen(argv[k]) > 450) {
+   printf("Error, program name too long %d\n",
+  (int) strlen(argv[k]));
+   error = 3;
+   } else {
+   if (!strcmp(argv[k]+strlen(argv[k])-2, ".o"))
+   strcpy(fn, argv[k]);
+   else
+   sprintf(fn, "%s_kern.o", argv[k]);
+   if (logFlag)
+   printf("loading bpf file:%s\n", fn);
+   if (load_bpf_file(fn)) {
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+   if (logFlag) {
+   printf("TCP BPF Loaded %s\n", fn);
+   printf("%s\n", bpf_log_buf);
+   }
+   error = bpf_prog_attach(prog_fd[0], 0,
+

[PATCH net-next v3 04/15] bpf: Sample bpf program to set SYN/SYN-ACK RTOs

2017-06-19 Thread Lawrence Brakmo

The sample BPF program, tcp_synrto_kern.c, sets the SYN and SYN-ACK
RTOs to 10ms when both hosts are within the same datacenter (i.e.
small RTTs) in an environment where common IPv6 prefixes indicate
both hosts are in the same data center.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile  |  1 +
 samples/bpf/tcp_synrto_kern.c | 59 +++
 2 files changed, 60 insertions(+)
 create mode 100644 samples/bpf/tcp_synrto_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ed6bc75..21cb016 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -113,6 +113,7 @@ always += lwt_len_hist_kern.o
 always += xdp_tx_iptunnel_kern.o
 always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
+always += tcp_synrto_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000..b11efd8
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,59 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
+ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
+ * that means both hosts are in the same datacenter.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_synrto(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int rv = -1;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check for TIMEOUT_INIT operation and IPv6 addresses */
+   if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
+   skops->family == AF_INET6) {
+
+   /* If the first 5.5 bytes of the IPv6 address are the same
+* then both hosts are in the same datacenter
+* so use an RTO of 10ms
+*/
+   if (skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0))
+   rv = 10;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3

Re: [PATCH v2] arm: eBPF JIT compiler

2017-06-19 Thread Shubham Bansal

Hi Daniel,

>
> Sorry, had a travel over the weekend, so didn't read it in time.
>
> What is the issue with imitating in JIT what the interpreter is
> doing as a starting point? That should be generic enough to handle
> any case.
>
> Otherwise you'd need some sort of reverse mapping since verifier
> already converted BPF_CALL insns into relative helper addresses
> in imm part.
>
Sorry but I don't get what you are trying to say. Can you explain it
with an example?

-Shubham

Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control

2017-06-19 Thread Lawrence Brakmo


On 6/19/17, 3:34 PM, "Daniel Borkmann"  wrote:

On 06/18/2017 04:39 AM, Lawrence Brakmo wrote:
> On 6/16/17, 6:58 AM, "Daniel Borkmann"  wrote:
[...]
>  >   /* Change congestion control for socket */
>  > -int tcp_set_congestion_control(struct sock *sk, const char *name)
>  > +int tcp_set_congestion_control(struct sock *sk, const char *name, 
bool load)
>  >   {
>  >struct inet_connection_sock *icsk = inet_csk(sk);
>  >const struct tcp_congestion_ops *ca;
>  > @@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock 
*sk, const char *name)
>  >return -EPERM;
>  >
>  >rcu_read_lock();
>  > -  ca = __tcp_ca_find_autoload(name);
>  > +  if (!load)
>  > +  ca = tcp_ca_find(name);
>  > +  else
>  > +  ca = __tcp_ca_find_autoload(name);
>
>   From BPF program side, we call with !load since we're not allowed
>  to sleep under RCU, that's correct ...
>
>  >/* No change asking for existing value */
>  >if (ca == icsk->icsk_ca_ops) {
>  >icsk->icsk_ca_setsockopt = 1;
>  > @@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock 
*sk, const char *name)
>  >}
>  >if (!ca)
>  >err = -ENOENT;
>  > +  else if (!load)
>  > +  icsk->icsk_ca_ops = ca;
>
>  ... but don't we also need to hold a module ref in this case as done
>  below?
>
>  Meaning, tcp_ca_find() could return a ca that was previously loaded
>  to the tcp_cong_list as module, then resulting in ref count imbalance
>  when set from BPF?
>
> As I mentioned above, this can be called before congestion has been
> initialized (op <= BPF_SOCKET_OPS_NEEDS_ECN) in which case
> tcp_init_congestion_control will be called later. If op > ..OPS_NEEDS_ECN
> then bpf_setsockopt() will call the reinit_congestion_control().
>
> But this points to an issue where someone else could call
> tcp_set_congestion_control() with load == false not knowing they
> need to call either init or reinit. I will add a comment to the function
> to make it clear.

Hm, I'm not sure it answers my question. What I meant was that from BPF
prog, you're setting tcp_set_congestion_control(..., false) so if
tcp_ca_find() returns a ca that was loaded earlier as a from a module
(so it becomes available in tcp_cong_list), the above...

   [...]
   else if (!load)
   icsk->icsk_ca_ops = ca;
   [...]

... will basically prevent the later try_module_get() on the ca. So any
later tcp_reinit_congestion_control() or tcp_init_congestion_control()
will still run not having the refcount held on the owner module. Meaning
a module unload would let the machine crash due to the refcnt imbalance?
What am I missing?

Nothing, you are correct. I was mistakenly thinking that the refcount update
was being done in tcp_init_congestion_control. Done.

Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE

2017-06-19 Thread Daniel Borkmann


On 06/20/2017 02:01 AM, Jakub Kicinski wrote:

On Tue, 20 Jun 2017 01:50:17 +0200, Daniel Borkmann wrote:

On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Respect the XDP_FLAGS_HW_MODE.  When it's set install the program
on the NIC and skip enabling XDP in the driver.

Signed-off-by: Jakub Kicinski 
---
   drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++---
   1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 68648e312129..c5903b6e58c5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3310,19 +3310,22 @@ static int
   nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags,
  struct netlink_ext_ack *extack)
   {
-   struct bpf_prog *offload_prog;
+   struct bpf_prog *drv_prog, *offload_prog;
int err;

if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES)
return -EBUSY;

+   drv_prog = flags & XDP_FLAGS_HW_MODE  ? NULL : prog;
offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog;


Can you make this assumption here? If dev_change_xdp_fd() is called
without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog
to both, drv_prog and offload_prog. Is this expected?

Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES)
and then set flags |= XDP_FLAGS_DRV_MODE before both assignments?


I thought we did want both.  In case the program is loaded to both the
HW/FW will mark the packets with BPF bit in the descriptor so that they
are not processed twice.  But the driver path will be configured for
running bpf and when user replaces the program with one which cannot be
offloaded the driver will not have to reconfigure itself.


Okay, that's a good point ... so that you can just use xchg() later on.
Probably worth explaining this rationale in a short comment.

Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE

2017-06-19 Thread Jakub Kicinski

On Tue, 20 Jun 2017 01:50:17 +0200, Daniel Borkmann wrote:
> On 06/17/2017 01:57 AM, Jakub Kicinski wrote:
> > Respect the XDP_FLAGS_HW_MODE.  When it's set install the program
> > on the NIC and skip enabling XDP in the driver.
> >
> > Signed-off-by: Jakub Kicinski 
> > ---
> >   drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++---
> >   1 file changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
> > b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > index 68648e312129..c5903b6e58c5 100644
> > --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > @@ -3310,19 +3310,22 @@ static int
> >   nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags,
> >   struct netlink_ext_ack *extack)
> >   {
> > -   struct bpf_prog *offload_prog;
> > +   struct bpf_prog *drv_prog, *offload_prog;
> > int err;
> >
> > if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES)
> > return -EBUSY;
> >
> > +   drv_prog = flags & XDP_FLAGS_HW_MODE  ? NULL : prog;
> > offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog;  
> 
> Can you make this assumption here? If dev_change_xdp_fd() is called
> without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog
> to both, drv_prog and offload_prog. Is this expected?
> 
> Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES)
> and then set flags |= XDP_FLAGS_DRV_MODE before both assignments?

I thought we did want both.  In case the program is loaded to both the
HW/FW will mark the packets with BPF bit in the descriptor so that they
are not processed twice.  But the driver path will be configured for
running bpf and when user replaces the program with one which cannot be
offloaded the driver will not have to reconfigure itself.

Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Respect the XDP_FLAGS_HW_MODE.  When it's set install the program
on the NIC and skip enabling XDP in the driver.

Signed-off-by: Jakub Kicinski 
---
  drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++---
  1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 68648e312129..c5903b6e58c5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3310,19 +3310,22 @@ static int
  nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags,
  struct netlink_ext_ack *extack)
  {
-   struct bpf_prog *offload_prog;
+   struct bpf_prog *drv_prog, *offload_prog;
int err;

if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES)
return -EBUSY;

+   drv_prog = flags & XDP_FLAGS_HW_MODE  ? NULL : prog;
offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog;


Can you make this assumption here? If dev_change_xdp_fd() is called
without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog
to both, drv_prog and offload_prog. Is this expected?

Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES)
and then set flags |= XDP_FLAGS_DRV_MODE before both assignments?


-   err = nfp_net_xdp_setup_drv(nn, prog, extack);
+   err = nfp_net_xdp_setup_drv(nn, drv_prog, extack);
if (err)
return err;

-   nfp_app_xdp_offload(nn->app, nn, offload_prog);
+   err = nfp_app_xdp_offload(nn->app, nn, offload_prog);
+   if (err && flags & XDP_FLAGS_HW_MODE)
+   return err;

if (nn->xdp_prog)
bpf_prog_put(nn->xdp_prog);
@@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct 
netdev_xdp *xdp)

switch (xdp->command) {
case XDP_SETUP_PROG:
+   case XDP_SETUP_PROG_HW:
return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags,
 xdp->extack);
case XDP_QUERY_PROG:

Re: [RFC net-next 7/8] xdp: add reporting of offload mode

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Extend the XDP_ATTACHED_* values to include offloaded mode.
Let drivers report whether program is installed in the driver
or the HW by changing the prog_attached field from bool to
u8 (type of the netlink attribute).

Exploit the fact that the value of XDP_ATTACHED_DRV is 1,
therefore since all drivers currently assign the mode with
double negation:
mode = !!xdp_prog;
no drivers have to be modified.

Signed-off-by: Jakub Kicinski 


Acked-by: Daniel Borkmann

Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Add an installation-time flag for requesting that the program
be installed only if it can be offloaded to HW.

Internally new command for ndo_xdp is added, this way we avoid
putting checks into drivers since they all return -EINVAL on
an unknown command.

Signed-off-by: Jakub Kicinski 


Acked-by: Daniel Borkmann

Re: [RFC net-next 1/8] xdp: pass XDP flags into install handlers

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Pass XDP flags to the xdp ndo.  This will allow drivers to look
at the mode flags and make decisions about offload.

Signed-off-by: Jakub Kicinski 


Acked-by: Daniel Borkmann

Re: [PATCH net-next 0/1] Introduction of the tc tests

2017-06-19 Thread Cong Wang

Hi,

On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates  wrote:
> Apologies for sending this as one big patch. I've been sitting on this a 
> little
> too long, but it's ready and I wanted to get it out.
>
> There are a limited number of tests to start - I plan to add more on a regular
> basis.
>
> Lucas Bates (1):
>   selftests: Introduce tc testsuite

Nice work!

Is there any particular reason you want to put these tests in kernel tree
especially tools/testing/selftests/ ?

Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs

2017-06-19 Thread Daniel Borkmann


On 06/20/2017 01:24 AM, Jakub Kicinski wrote:
[...]

The XDP_SETUP_PROG_HW command is purely for convenience of drivers
without an offload.  I felt it's not appropriate to burden all drivers
with:

if (xdp->flags & XDP_FLAGS_HW_MODE)
return -EOPNOTSUPP;

But, I do have a patch which does it, so I'm happy to drop the new
command if it's preferred.


Ahh, that makes sense, yep. I was only focused on reviewing this in
the context of nfp driver. Lack of coffee. ;)

Re: [RFC net-next 5/8] nfp: bpf: take a reference on offloaded programs

2017-06-19 Thread Jakub Kicinski

On Tue, 20 Jun 2017 01:23:05 +0200, Daniel Borkmann wrote:
> On 06/17/2017 01:57 AM, Jakub Kicinski wrote:
> > The xdp_prog member of the adapter's data path structure is used
> > for XDP in driver mode.  In case an XDP program is loaded in
> > HW-only mode, we need to store it somewhere else.  Add a new XDP
> > prog pointer in the main structure and use that when we need to
> > know whether any XDP program is loaded, not only a driver mode
> > one.  Only release our reference on adapter free instead of
> > immediately after netdev unregister to allow offload to be disabled
> > first.
> >
> > Signed-off-by: Jakub Kicinski   
> [...]
> > @@ -3327,6 +3323,10 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct 
> > bpf_prog 
> > return err;
> >
> > nfp_app_xdp_offload(nn->app, nn, offload_prog);
> > +
> > +   if (nn->xdp_prog)
> > +   bpf_prog_put(nn->xdp_prog);
> > +   nn->xdp_prog = prog;
> > nn->xdp_flags = flags;
> >
> > return 0;  
> 
> Can you elaborate on the extra reference on the prog? 

Sorry, this patch went through a few revisions and the subject doesn't
express the intent too well any more :S  Originally it was about making
sure we have a reference on the program when it's offloaded but not
loaded in the driver, but I realized that we have the reference from
dev_change_xdp_fd() already, so now the patch just releases the
reference on the offloaded program.

> So in nfp_net_xdp_setup(), assuming a prog was already loaded on
> driver side: after your set, nfp_net_xdp_setup_drv() will then
> do the xchg() on nn->dp.xdp_prog, bpf_prog_put() this one and
> later back in nfp_net_xdp_setup() we check nn->xdp_prog and
> bpf_prog_put() it if it existed before and update nn->xdp_prog
> to the current prog. So you end up with two puts on the same
> program, but I don't see where you take the one additional ref
> aside from the ref that you already get from dev_change_xdp_fd().
> What am I missing?

You are right, I missed there were two spots where I was doing a
bpf_prog_put() in nfp_net_xdp_setup_drv(), thanks!

[PATCH net-next v2] enic: Fix format truncation warning

2017-06-19 Thread Govindarajulu Varadarajan

With -Wformat-truncation, gcc throws the following warning.

Fix this by increasing the size of devname to accommodate 15 character
netdev interface name and description.

Remove length format precision for %s. We can fit entire name.

Also increment the version.

drivers/net/ethernet/cisco/enic/enic_main.c: In function ‘enic_open’:
drivers/net/ethernet/cisco/enic/enic_main.c:1740:15: warning: ‘%u’ directive 
output may be truncated writing between 1 and 2 bytes into a region of size 
between 1 and 12 [-Wformat-truncation=]
 "%.11s-rx-%u", netdev->name, i);
   ^~
drivers/net/ethernet/cisco/enic/enic_main.c:1740:5: note: directive argument in 
the range [0, 16]
 "%.11s-rx-%u", netdev->name, i);
 ^
drivers/net/ethernet/cisco/enic/enic_main.c:1738:4: note: ‘snprintf’ output 
between 6 and 18 bytes into a destination of size 16
snprintf(enic->msix[intr].devname,
^~
 sizeof(enic->msix[intr].devname),
 ~
 "%.11s-rx-%u", netdev->name, i);
 ~~~

Signed-off-by: Govindarajulu Varadarajan 
---
v2: don't use kasprintf, increase the devname size
http://patchwork.ozlabs.org/patch/777568/

 drivers/net/ethernet/cisco/enic/enic.h  | 4 ++--
 drivers/net/ethernet/cisco/enic/enic_main.c | 8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h 
b/drivers/net/ethernet/cisco/enic/enic.h
index 2b23f46b34d3..ba032ac9ae86 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -33,7 +33,7 @@
 
 #define DRV_NAME   "enic"
 #define DRV_DESCRIPTION"Cisco VIC Ethernet NIC Driver"
-#define DRV_VERSION"2.3.0.31"
+#define DRV_VERSION"2.3.0.42"
 #define DRV_COPYRIGHT  "Copyright 2008-2013 Cisco Systems, Inc"
 
 #define ENIC_BARS_MAX  6
@@ -47,7 +47,7 @@
 
 struct enic_msix_entry {
int requested;
-   char devname[IFNAMSIZ];
+   char devname[IFNAMSIZ + 8];
irqreturn_t (*isr)(int, void *);
void *devid;
cpumask_var_t affinity_mask;
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c 
b/drivers/net/ethernet/cisco/enic/enic_main.c
index 6a9c8878aca0..d24ee1ad3be1 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1737,7 +1737,7 @@ static int enic_request_intr(struct enic *enic)
intr = enic_msix_rq_intr(enic, i);
snprintf(enic->msix[intr].devname,
sizeof(enic->msix[intr].devname),
-   "%.11s-rx-%u", netdev->name, i);
+   "%s-rx-%u", netdev->name, i);
enic->msix[intr].isr = enic_isr_msix;
enic->msix[intr].devid = &enic->napi[i];
}
@@ -1748,7 +1748,7 @@ static int enic_request_intr(struct enic *enic)
intr = enic_msix_wq_intr(enic, i);
snprintf(enic->msix[intr].devname,
sizeof(enic->msix[intr].devname),
-   "%.11s-tx-%u", netdev->name, i);
+   "%s-tx-%u", netdev->name, i);
enic->msix[intr].isr = enic_isr_msix;
enic->msix[intr].devid = &enic->napi[wq];
}
@@ -1756,14 +1756,14 @@ static int enic_request_intr(struct enic *enic)
intr = enic_msix_err_intr(enic);
snprintf(enic->msix[intr].devname,
sizeof(enic->msix[intr].devname),
-   "%.11s-err", netdev->name);
+   "%s-err", netdev->name);
enic->msix[intr].isr = enic_isr_msix_err;
enic->msix[intr].devid = enic;
 
intr = enic_msix_notify_intr(enic);
snprintf(enic->msix[intr].devname,
sizeof(enic->msix[intr].devname),
-   "%.11s-notify", netdev->name);
+   "%s-notify", netdev->name);
enic->msix[intr].isr = enic_isr_msix_notify;
enic->msix[intr].devid = enic;
 
-- 
2.13.1

Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs

2017-06-19 Thread Jakub Kicinski

On Tue, 20 Jun 2017 00:55:41 +0200, Daniel Borkmann wrote:
> On 06/17/2017 01:57 AM, Jakub Kicinski wrote:
> > Add an installation-time flag for requesting that the program
> > be installed only if it can be offloaded to HW.
> >
> > Internally new command for ndo_xdp is added, this way we avoid
> > putting checks into drivers since they all return -EINVAL on
> > an unknown command.
> >
> > Signed-off-by: Jakub Kicinski   
> [...]
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index a04db264aa1c..05cec8e2cd82 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6959,7 +6959,10 @@ static int dev_xdp_install(struct net_device *dev, 
> > xdp_op_t xdp_op,
> > struct netdev_xdp xdp;
> >
> > memset(&xdp, 0, sizeof(xdp));
> > -   xdp.command = XDP_SETUP_PROG;
> > +   if (flags & XDP_FLAGS_HW_MODE)
> > +   xdp.command = XDP_SETUP_PROG_HW;
> > +   else
> > +   xdp.command = XDP_SETUP_PROG;
> > xdp.extack = extack;
> > xdp.flags = flags;
> > xdp.prog = prog;  
> 
> One thing I'm not sure I follow is that while you pass flags to the ndo
> in patch 1, add a new XDP_SETUP_PROG_HW command here in patch 2 based on
> the flags, and later on in patch 6, you don't really make use of it, but
> look at the flags anyway? Then, why adding separate XDP_SETUP_PROG_HW
> in the first place?
> 
> [patch 6:]
> @@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, 
> struct netdev_xdp *xdp)
> 
>   switch (xdp->command) {
>   case XDP_SETUP_PROG:
> + case XDP_SETUP_PROG_HW:
>   return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags,
>xdp->extack);

We still need the flags to be able to differentiate between default/no
flags case where we load to the driver and the HW ("both"), and when
the DRV_MODE flag is set, in which case we disable the HW offload and
only load to the driver.  We have three cases:

             drv    offload
  no flag    yes    attempted
 DRV_MODE    yes    no
  HW_MODE    no     yes

The XDP_SETUP_PROG_HW command is purely for convenience of drivers
without an offload.  I felt it's not appropriate to burden all drivers
with:

if (xdp->flags & XDP_FLAGS_HW_MODE)
return -EOPNOTSUPP;

But, I do have a patch which does it, so I'm happy to drop the new
command if it's preferred.

Re: [RFC net-next 5/8] nfp: bpf: take a reference on offloaded programs

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

The xdp_prog member of the adapter's data path structure is used
for XDP in driver mode.  In case an XDP program is loaded in
HW-only mode, we need to store it somewhere else.  Add a new XDP
prog pointer in the main structure and use that when we need to
know whether any XDP program is loaded, not only a driver mode
one.  Only release our reference on adapter free instead of
immediately after netdev unregister to allow offload to be disabled
first.

Signed-off-by: Jakub Kicinski 

[...]

@@ -3327,6 +3323,10 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog 
*prog, u32 flags,
return err;

nfp_app_xdp_offload(nn->app, nn, offload_prog);
+
+   if (nn->xdp_prog)
+   bpf_prog_put(nn->xdp_prog);
+   nn->xdp_prog = prog;
nn->xdp_flags = flags;

return 0;


Can you elaborate on the extra reference on the prog? So in
nfp_net_xdp_setup(), assuming a prog was already loaded on
driver side: after your set, nfp_net_xdp_setup_drv() will then
do the xchg() on nn->dp.xdp_prog, bpf_prog_put() this one and
later back in nfp_net_xdp_setup() we check nn->xdp_prog and
bpf_prog_put() it if it existed before and update nn->xdp_prog
to the current prog. So you end up with two puts on the same
program, but I don't see where you take the one additional ref
aside from the ref that you already get from dev_change_xdp_fd().
What am I missing?

Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 01:57 AM, Jakub Kicinski wrote:

Add an installation-time flag for requesting that the program
be installed only if it can be offloaded to HW.

Internally new command for ndo_xdp is added, this way we avoid
putting checks into drivers since they all return -EINVAL on
an unknown command.

Signed-off-by: Jakub Kicinski 

[...]

diff --git a/net/core/dev.c b/net/core/dev.c
index a04db264aa1c..05cec8e2cd82 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6959,7 +6959,10 @@ static int dev_xdp_install(struct net_device *dev, 
xdp_op_t xdp_op,
struct netdev_xdp xdp;

memset(&xdp, 0, sizeof(xdp));
-   xdp.command = XDP_SETUP_PROG;
+   if (flags & XDP_FLAGS_HW_MODE)
+   xdp.command = XDP_SETUP_PROG_HW;
+   else
+   xdp.command = XDP_SETUP_PROG;
xdp.extack = extack;
xdp.flags = flags;
xdp.prog = prog;


One thing I'm not sure I follow is that while you pass flags to the ndo
in patch 1, add a new XDP_SETUP_PROG_HW command here in patch 2 based on
the flags, and later on in patch 6, you don't really make use of it, but
look at the flags anyway? Then, why adding separate XDP_SETUP_PROG_HW
in the first place?

[patch 6:]
@@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct 
netdev_xdp *xdp)

switch (xdp->command) {
case XDP_SETUP_PROG:
+   case XDP_SETUP_PROG_HW:
return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags,
 xdp->extack);

Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control

2017-06-19 Thread Daniel Borkmann

On 06/18/2017 04:39 AM, Lawrence Brakmo wrote:

On 6/16/17, 6:58 AM, "Daniel Borkmann"  wrote:

[...]

 >   /* Change congestion control for socket */
 > -int tcp_set_congestion_control(struct sock *sk, const char *name)
 > +int tcp_set_congestion_control(struct sock *sk, const char *name, bool 
load)
 >   {
 >   struct inet_connection_sock *icsk = inet_csk(sk);
 >   const struct tcp_congestion_ops *ca;
 > @@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock *sk, 
const char *name)
 >   return -EPERM;
 >
 >   rcu_read_lock();
 > - ca = __tcp_ca_find_autoload(name);
 > + if (!load)
 > + ca = tcp_ca_find(name);
 > + else
 > + ca = __tcp_ca_find_autoload(name);

  From BPF program side, we call with !load since we're not allowed
 to sleep under RCU, that's correct ...

 >   /* No change asking for existing value */
 >   if (ca == icsk->icsk_ca_ops) {
 >   icsk->icsk_ca_setsockopt = 1;
 > @@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock *sk, 
const char *name)
 >   }
 >   if (!ca)
 >   err = -ENOENT;
 > + else if (!load)
 > + icsk->icsk_ca_ops = ca;

 ... but don't we also need to hold a module ref in this case as done
 below?

 Meaning, tcp_ca_find() could return a ca that was previously loaded
 to the tcp_cong_list as module, then resulting in ref count imbalance
 when set from BPF?

As I mentioned above, this can be called before congestion has been
initialized (op <= BPF_SOCKET_OPS_NEEDS_ECN) in which case
tcp_init_congestion_control will be called later. If op > ..OPS_NEEDS_ECN
then bpf_setsockopt() will call the reinit_congestion_control().

But this points to an issue where someone else could call
tcp_set_congestion_control() with load == false not knowing they
need to call either init or reinit. I will add a comment to the function
to make it clear.

Hm, I'm not sure it answers my question. What I meant was that from BPF
prog, you're setting tcp_set_congestion_control(..., false) so if
tcp_ca_find() returns a ca that was loaded earlier from a module
(so it becomes available in tcp_cong_list), the above...

  [...]
  else if (!load)
  icsk->icsk_ca_ops = ca;
  [...]

... will basically prevent the later try_module_get() on the ca. So any
later tcp_reinit_congestion_control() or tcp_init_congestion_control()
will still run not having the refcount held on the owner module. Meaning
a module unload would let the machine crash due to the refcnt imbalance?
What am I missing?

Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test

2017-06-19 Thread Andrew Lunn

On Mon, Jun 19, 2017 at 02:00:43PM -0700, Florian Fainelli wrote:
> On 06/16/2017 02:24 AM, Lin Yun Sheng wrote:
> > This patch fixes the phy loopback self_test failed issue. when
> > Marvell Phy Module is loaded, it will powerdown fiber when doing
> > phy loopback self test, which cause phy loopback self_test fail.
> > 
> > Signed-off-by: Lin Yun Sheng 
> > ---
> >  drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++--
> >  1 file changed, 14 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
> > b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> > index b8fab14..e95795b 100644
> > --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> > +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> > @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct 
> > phy_device *phy_dev, u8 en)
> 
> The question really is, why is not this properly integrated into the PHY
> driver and PHYLIB such that the only thing the Ethernet MAC driver has
> to call is a function of the PHY driver putting it in self-test?

This whole driver pokes various PHY registers, rather than use
phylib. And it does so without taking the PHY lock. It also assumes it
is a Marvell PHY and i don't see anywhere it actually verifies this.

This is all broken.

 Andrew

Re: Reply Urgent

2017-06-19 Thread INFO


Hello,

How are you doing? I have been sent to inform you that, We have an  
inheritance of a deceased client with your surname. Contact Mr Andrew  
Bailey Reply Email To: myinf...@gmail.com with your "Full Names" for  
more info.  Thanks for your understanding.


Reply ASAP thank you.

Melissa.
--
Correo Corporativo Hospital Universitario del Valle E.S.E
***

"Estamos re-dimensionandonos para crecer!"

**

Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test

2017-06-19 Thread Florian Fainelli

On 06/16/2017 02:24 AM, Lin Yun Sheng wrote:
> This patch fixes the phy loopback self_test failed issue. when
> Marvell Phy Module is loaded, it will powerdown fiber when doing
> phy loopback self test, which cause phy loopback self_test fail.
> 
> Signed-off-by: Lin Yun Sheng 
> ---
>  drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
> b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> index b8fab14..e95795b 100644
> --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
> @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct phy_device 
> *phy_dev, u8 en)

The question really is, why is not this properly integrated into the PHY
driver and PHYLIB such that the only thing the Ethernet MAC driver has
to call is a function of the PHY driver putting it in self-test?

>  
>   /* Force 1000M Link, Default is 0x0200 */
>   phy_write(phy_dev, 7, 0x20C);
> - phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>  
> - /* Enable PHY loop-back */
> + /* Powerup Fiber */
> + phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
> + val = phy_read(phy_dev, COPPER_CONTROL_REG);
> + val &= ~PHY_POWER_DOWN;
> + phy_write(phy_dev, COPPER_CONTROL_REG, val);
> +
> + /* Enable Phy Loopback */
> + phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>   val = phy_read(phy_dev, COPPER_CONTROL_REG);
>   val |= PHY_LOOP_BACK;
>   val &= ~PHY_POWER_DOWN;
> @@ -299,6 +305,12 @@ static int hns_nic_config_phy_loopback(struct phy_device 
> *phy_dev, u8 en)
>   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA);
>   phy_write(phy_dev, 1, 0x400);
>   phy_write(phy_dev, 7, 0x200);
> +
> + phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
> + val = phy_read(phy_dev, COPPER_CONTROL_REG);
> + val |= PHY_POWER_DOWN;
> + phy_write(phy_dev, COPPER_CONTROL_REG, val);
> +
>   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
>   phy_write(phy_dev, 9, 0xF00);
>  
> 


-- 
Florian

Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops

2017-06-19 Thread Lawrence Brakmo


On 6/19/17, 11:44 AM, "Daniel Borkmann"  wrote:

On 06/17/2017 01:41 AM, Lawrence Brakmo wrote:
> On 6/16/17, 5:07 AM, "Daniel Borkmann"  wrote:
[...]
> I see. You are saying have one struct in common but still keep the two
> PROG_TYPES? That makes sense. Do we really need two different
> is_valid_access functions? Both types should be able to see all
> the fields (otherwise adding new fields becomes messy).

Would probably leave the two is_valid_access() separate initially, and
once people ask for it we could potentially open this up to some of
the other fields that are available at that time.

As discussed in the other thread, I will keep the 2 structs

>  > Currently there are two types of ops. The first type expects the 
BPF
>  > program to return a value which is then used by the caller (or a
>  > negative value to indicate the operation is not supported). The 
second
>  > type expects state changes to be done by the BPF program, for 
example
>  > through a setsockopt BPF helper function, and they ignore the 
return
>  > value.
[...]
>  > +/* Call BPF_SOCKET_OPS program that returns an int. If the return 
value
>  > + * is < 0, then the BPF op failed (for example if the loaded BPF
>  > + * program does not support the chosen operation or there is no 
BPF
>  > + * program loaded).
>  > + */
>  > +#ifdef CONFIG_BPF
>  > +static inline int tcp_call_bpf(struct sock *sk, bool is_req_sock, 
int op)
>  > +{
>  > +  struct bpf_socket_ops_kern socket_ops;
>  > +
>  > +  memset(&socket_ops, 0, sizeof(socket_ops));
>  > +  socket_ops.sk = sk;
>  > +  socket_ops.is_req_sock = is_req_sock ? 1 : 0;
>
>  Is is_req_sock actually used here in this patch (apart from setting 
it)?
>  Not seeing that BPF prog will access it, if it also shouldn't access 
it,
>  then bool type would be better.
>
> The only reason I used a bit was in case I wanted to add more fields 
later on.
> Does it make sense or should I just use bool?

Didn't know that, but I think starting out with bool seems a bit
cleaner, if needed we could later still switch to bitfield.

Done.

>  > +  socket_ops.op = op;
>  > +
>  > +  return bpf_socket_ops_call(&socket_ops);
>  > +}
[...]
>  > +/* Global BPF program for sockets */
>  > +static struct bpf_prog *bpf_socket_ops_prog;
>  > +static DEFINE_RWLOCK(bpf_socket_ops_lock);
>  > +
>  > +int bpf_socket_ops_set_prog(int fd)
>  > +{
>  > +  int err = 0;
>  > +
>  > +  write_lock(&bpf_socket_ops_lock);
>  > +  if (bpf_socket_ops_prog) {
>  > +  bpf_prog_put(bpf_socket_ops_prog);
>  > +  bpf_socket_ops_prog = NULL;
>  > +  }
>  > +
>  > +  /* fd of zero is used as a signal to remove the current
>  > +   * bpf_socket_ops_prog.
>  > +   */
>  > +  if (fd == 0) {
>
>  Can we make the fd related semantics similar to dev_change_xdp_fd()?
>
> Do you mean remove program if fd < 0 instead of == 0?

Yes, that and also the ordering of dropping the ref of the existing
bpf_socket_ops_prog program with setting the new one, so you can
convert bpf_socket_ops_prog to RCU more easily.

I made lots of changes to how we set/attach the global_sock_ops program
affecting the files kernel/bpf/syscall.c, net/core/sock_bpfops.c and
samples/bpf/tcp_bpf.c. The patch set will be submitted later today.

>  > +  write_unlock(&bpf_socket_ops_lock);
>  > +  return 1;
>  > +  }
>  > +
>  > +  bpf_socket_ops_prog = bpf_prog_get_type(fd, 
BPF_PROG_TYPE_SOCKET_OPS);
>  > +  if (IS_ERR(bpf_socket_ops_prog)) {
>  > +  bpf_prog_put(bpf_socket_ops_prog);
>
>  This will crash the kernel, passing err value to bpf_prog_put().
[...]

Thanks again for the feedback.

Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops

2017-06-19 Thread Lawrence Brakmo


On 6/19/17, 11:52 AM, "Daniel Borkmann"  wrote:

On 06/17/2017 11:48 PM, Lawrence Brakmo wrote:
> On 6/16/17, 5:07 AM, "Daniel Borkmann"  wrote:
>
>  On 06/15/2017 10:08 PM, Lawrence Brakmo wrote:
>  > Two new corresponding structs (one for the kernel one for the 
user/BPF
>  > program):
>  >
>  > /* kernel version */
>  > struct bpf_socket_ops_kern {
>  >  struct sock *sk;
>  >__u32  is_req_sock:1;
>  >  __u32  op;
>  >  union {
>  >  __u32 reply;
>  >  __u32 replylong[4];
>  >  };
>  > };
>  >
>  > /* user version */
>  > struct bpf_socket_ops {
>  >  __u32 op;
>  >  union {
>  >  __u32 reply;
>  >  __u32 replylong[4];
>  >  };
>  >  __u32 family;
>  >  __u32 remote_ip4;
>  >  __u32 local_ip4;
>  >  __u32 remote_ip6[4];
>  >  __u32 local_ip6[4];
>  >  __u32 remote_port;
>  >  __u32 local_port;
>  > };
>
>  Above and ...
>
>  struct bpf_sock {
>   __u32 bound_dev_if;
>   __u32 family;
>   __u32 type;
>   __u32 protocol;
>  };
>
>  ... would result in two BPF sock user versions. It's okayish, but
>  given struct bpf_sock is quite generic, couldn't we merge the members
>  from struct bpf_socket_ops into struct bpf_sock instead?
>
>  Idea would be that sock_filter_is_valid_access() for cgroups would
>  then check off < 0 || off + size > offsetofend(struct bpf_sock, 
protocol)
>  to disallow new members, and your socket_ops_is_valid_access() could
>  allow and xlate the full range. The family member is already 
duplicate
>  and the others could then be accessed from these kind of BPF progs as
>  well, plus we have a single user representation similar as with 
__sk_buff
>  that multiple types will use.
>
> I am concerned that it could make usage more confusing. One type of
> sock program (cgroup) could only use a subset of the fields while the
> other type (socket_ops) could use all (or a different subset). Then what
> happens if there is a need to add a new field to cgroup type sock program?
> In addition, in the near future I will have a patch to attach socket_ops
> programs to cgroups.
> I'd rather leave it as it is.

Okay, I'm fine with that as well. For the __sk_buff, we also have the
case that some members are not available for all program types like
tc_classid, so it's similar there. But if indeed the majority of members
cannot be supported for the most parts (?) then having different structs
seems okay, probably easier to use, but we should try hard to not end
up with 10 different uapi socket structs that apply to program types
working on sockets in one way or another.

Agree 100%.

[PATCH net-next v2 1/4] rtnetlink: add NEWCACHEREPORT message type

2017-06-19 Thread Julien Gomes

New NEWCACHEREPORT message type to be used for cache reports sent
via Netlink, effectively allowing splitting cache report reception from
mroute programming.

Suggested-by: Ryan Halbrook 
Signed-off-by: Julien Gomes 
---
 include/uapi/linux/rtnetlink.h | 3 +++
 security/selinux/nlmsgtab.c| 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 564790e854f7..cd1afb900929 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -146,6 +146,9 @@ enum {
RTM_GETSTATS = 94,
 #define RTM_GETSTATS RTM_GETSTATS
 
+   RTM_NEWCACHEREPORT = 96,
+#define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT
+
__RTM_MAX,
 #define RTM_MAX(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 5aeaf30b7a13..7b7433a1a34c 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -79,6 +79,7 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
{ RTM_GETNSID,  NETLINK_ROUTE_SOCKET__NLMSG_READ  },
{ RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
{ RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+   { RTM_NEWCACHEREPORT,   NETLINK_ROUTE_SOCKET__NLMSG_READ },
 };
 
 static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
@@ -158,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 
*perm)
switch (sclass) {
case SECCLASS_NETLINK_ROUTE_SOCKET:
/* RTM_MAX always point to RTM_SET, ie RTM_NEWxxx + 3 */
-   BUILD_BUG_ON(RTM_MAX != (RTM_NEWSTATS + 3));
+   BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3));
err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 sizeof(nlmsg_route_perms));
break;
-- 
2.13.1

[PATCH net-next v2 0/4] ipmr/ip6mr: add Netlink notifications on cache reports

2017-06-19 Thread Julien Gomes

Currently, all ipmr/ip6mr cache reports are sent through the
mroute/mroute6 socket only.
This forces the use of a single socket for mroute programming, cache
reports and, regarding ipmr, IGMP messages without Router Alert option
reception.

The present patches are aiming to send Netlink notifications in addition
to the existing igmpmsg/mrt6msg to give user programs a way to handle
cache reports in parallel with multiple sockets other than the
mroute/mroute6 socket.

Changes in v2:
- Changed attributes naming from {IPMRA,IP6MRA}_CACHEREPORTA_* to
  {IPMRA,IP6MRA}_CREPORT_*
- Improved packet data copy to handle non-linear packets in
  ipmr/ip6mr cache report Netlink notification creation
- Added two rtnetlink groups with restricted-binding
- Changed cache report notified groups from RTNL_{IPV4,IPV6}_MROUTE to
  the new restricted groups in ipmr/ip6mr

Julien Gomes (4):
  rtnetlink: add NEWCACHEREPORT message type
  rtnetlink: add restricted rtnl groups for ipv4 and ipv6 mroute
  ipmr: add netlink notifications on igmpmsg cache reports
  ip6mr: add netlink notifications on mrt6msg cache reports

 include/uapi/linux/mroute.h| 12 
 include/uapi/linux/mroute6.h   | 12 
 include/uapi/linux/rtnetlink.h |  7 +
 net/core/rtnetlink.c   | 13 
 net/ipv4/ipmr.c| 67 --
 net/ipv6/ip6mr.c   | 67 --
 security/selinux/nlmsgtab.c|  3 +-
 7 files changed, 176 insertions(+), 5 deletions(-)

-- 
2.13.1

[PATCH net-next v2 3/4] ipmr: add netlink notifications on igmpmsg cache reports

2017-06-19 Thread Julien Gomes

Add Netlink notifications on cache reports in ipmr, in addition to the
existing igmpmsg sent to mroute_sk.
Send RTM_NEWCACHEREPORT notifications to RTNLGRP_IPV4_MROUTE_R.

MSGTYPE, VIF_ID, SRC_ADDR and DST_ADDR Netlink attributes contain the
same data as their equivalent fields in the igmpmsg header.
PKT attribute is the packet sent to mroute_sk, without the added igmpmsg
header.

Suggested-by: Ryan Halbrook 
Signed-off-by: Julien Gomes 
---
 include/uapi/linux/mroute.h | 12 
 net/ipv4/ipmr.c | 67 +++--
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index f904367c0cee..e8e5041dea8e 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -152,6 +152,18 @@ enum {
 };
 #define IPMRA_VIFA_MAX (__IPMRA_VIFA_MAX - 1)
 
+/* ipmr netlink cache report attributes */
+enum {
+   IPMRA_CREPORT_UNSPEC,
+   IPMRA_CREPORT_MSGTYPE,
+   IPMRA_CREPORT_VIF_ID,
+   IPMRA_CREPORT_SRC_ADDR,
+   IPMRA_CREPORT_DST_ADDR,
+   IPMRA_CREPORT_PKT,
+   __IPMRA_CREPORT_MAX
+};
+#define IPMRA_CREPORT_MAX (__IPMRA_CREPORT_MAX - 1)
+
 /* That's all usermode folks */
 
 #define MFC_ASSERT_THRESH (3*HZ)   /* Maximal freq. of asserts */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3e7454aa49e8..1e591bcaad6d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -109,6 +109,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct 
sk_buff *skb,
  struct mfc_cache *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 int cmd);
+static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
 static void mroute_clean_tables(struct mr_table *mrt, bool all);
 static void ipmr_expire_process(unsigned long arg);
 
@@ -995,8 +996,7 @@ static void ipmr_cache_resolve(struct net *net, struct 
mr_table *mrt,
}
 }
 
-/* Bounce a cache query up to mrouted. We could use netlink for this but 
mrouted
- * expects the following bizarre scheme.
+/* Bounce a cache query up to mrouted and netlink.
  *
  * Called under mrt_lock.
  */
@@ -1062,6 +1062,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
return -EINVAL;
}
 
+   igmpmsg_netlink_event(mrt, skb);
+
/* Deliver to mrouted */
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
@@ -2341,6 +2343,67 @@ static void mroute_netlink_event(struct mr_table *mrt, 
struct mfc_cache *mfc,
rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
 }
 
+static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+{
+   struct net *net = read_pnet(&mrt->net);
+   struct nlmsghdr *nlh;
+   struct rtgenmsg *rtgenm;
+   struct igmpmsg *msg;
+   struct sk_buff *skb;
+   struct nlattr *nla;
+   int payloadlen;
+   int msgsize;
+
+   payloadlen = pkt->len - sizeof(struct igmpmsg);
+   msg = (struct igmpmsg *)skb_network_header(pkt);
+   msgsize = NLMSG_ALIGN(sizeof(struct rtgenmsg))
+   + nla_total_size(1)
+   /* IPMRA_CREPORT_MSGTYPE */
+   + nla_total_size(1)
+   /* IPMRA_CREPORT_VIF_ID */
+   + nla_total_size(4)
+   /* IPMRA_CREPORT_SRC_ADDR */
+   + nla_total_size(4)
+   /* IPMRA_CREPORT_DST_ADDR */
+   + nla_total_size(payloadlen)
+   /* IPMRA_CREPORT_PKT */
+   ;
+
+   skb = nlmsg_new(msgsize, GFP_ATOMIC);
+   if (!skb)
+   goto errout;
+
+   nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+   sizeof(struct rtgenmsg), 0);
+   if (!nlh)
+   goto errout;
+   rtgenm = nlmsg_data(nlh);
+   rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
+   if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
+   nla_put_u8(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
+   nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
+   msg->im_src.s_addr) ||
+   nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
+   msg->im_dst.s_addr))
+   goto nla_put_failure;
+
+   nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
+   if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
+ nla_data(nla), payloadlen))
+   goto nla_put_failure;
+
+   nlmsg_end(skb, nlh);
+
+   rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
+   return;
+
+nla_put_failure:
+   nlmsg_cancel(skb, nlh);
+errout:
+   kfree_skb(skb);
+

[PATCH net-next v2 2/4] rtnetlink: add restricted rtnl groups for ipv4 and ipv6 mroute

2017-06-19 Thread Julien Gomes

Add RTNLGRP_{IPV4,IPV6}_MROUTE_R as two new restricted groups for the
NETLINK_ROUTE family.
Binding to these groups specifically requires CAP_NET_ADMIN to allow
multicast of sensitive messages (e.g. mroute cache reports).

Signed-off-by: Julien Gomes 
---
 include/uapi/linux/rtnetlink.h |  4 
 net/core/rtnetlink.c   | 13 +
 2 files changed, 17 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cd1afb900929..d148505010a7 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -669,6 +669,10 @@ enum rtnetlink_groups {
 #define RTNLGRP_NSID   RTNLGRP_NSID
RTNLGRP_MPLS_NETCONF,
 #define RTNLGRP_MPLS_NETCONF   RTNLGRP_MPLS_NETCONF
+   RTNLGRP_IPV4_MROUTE_R,
+#define RTNLGRP_IPV4_MROUTE_R  RTNLGRP_IPV4_MROUTE_R
+   RTNLGRP_IPV6_MROUTE_R,
+#define RTNLGRP_IPV6_MROUTE_R  RTNLGRP_IPV6_MROUTE_R
__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX(__RTNLGRP_MAX - 1)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3aa57848a895..4aefa5a2625f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4218,6 +4218,18 @@ static void rtnetlink_rcv(struct sk_buff *skb)
rtnl_unlock();
 }
 
+static int rtnetlink_bind(struct net *net, int group)
+{
+   switch (group) {
+   case RTNLGRP_IPV4_MROUTE_R:
+   case RTNLGRP_IPV6_MROUTE_R:
+   if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+   return -EPERM;
+   break;
+   }
+   return 0;
+}
+
 static int rtnetlink_event(struct notifier_block *this, unsigned long event, 
void *ptr)
 {
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -4252,6 +4264,7 @@ static int __net_init rtnetlink_net_init(struct net *net)
.input  = rtnetlink_rcv,
.cb_mutex   = &rtnl_mutex,
.flags  = NL_CFG_F_NONROOT_RECV,
+   .bind   = rtnetlink_bind,
};
 
sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
-- 
2.13.1

[PATCH net-next v2 4/4] ip6mr: add netlink notifications on mrt6msg cache reports

2017-06-19 Thread Julien Gomes

Add Netlink notifications on cache reports in ip6mr, in addition to the
existing mrt6msg sent to mroute6_sk.
Send RTM_NEWCACHEREPORT notifications to RTNLGRP_IPV6_MROUTE_R.

MSGTYPE, MIF_ID, SRC_ADDR and DST_ADDR Netlink attributes contain the
same data as their equivalent fields in the mrt6msg header.
PKT attribute is the packet sent to mroute6_sk, without the added
mrt6msg header.

Suggested-by: Ryan Halbrook 
Signed-off-by: Julien Gomes 
---
 include/uapi/linux/mroute6.h | 12 
 net/ipv6/ip6mr.c | 67 ++--
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index ed5721148768..e4746816c855 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -133,4 +133,16 @@ struct mrt6msg {
struct in6_addr im6_src, im6_dst;
 };
 
+/* ip6mr netlink cache report attributes */
+enum {
+   IP6MRA_CREPORT_UNSPEC,
+   IP6MRA_CREPORT_MSGTYPE,
+   IP6MRA_CREPORT_MIF_ID,
+   IP6MRA_CREPORT_SRC_ADDR,
+   IP6MRA_CREPORT_DST_ADDR,
+   IP6MRA_CREPORT_PKT,
+   __IP6MRA_CREPORT_MAX
+};
+#define IP6MRA_CREPORT_MAX (__IP6MRA_CREPORT_MAX - 1)
+
 #endif /* _UAPI__LINUX_MROUTE6_H */
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index b0e2bf1f4212..28a1fb49f12e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -116,6 +116,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, 
struct sk_buff *skb,
   struct mfc6_cache *c, struct rtmsg *rtm);
 static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
  int cmd);
+static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
   struct netlink_callback *cb);
 static void mroute_clean_tables(struct mr6_table *mrt, bool all);
@@ -1125,8 +1126,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr6_table *mrt,
 }
 
 /*
- * Bounce a cache query up to pim6sd. We could use netlink for this but 
pim6sd
- * expects the following bizarre scheme.
+ * Bounce a cache query up to pim6sd and netlink.
  *
  * Called under mrt_lock.
  */
@@ -1208,6 +1208,8 @@ static int ip6mr_cache_report(struct mr6_table *mrt, 
struct sk_buff *pkt,
return -EINVAL;
}
 
+   mrt6msg_netlink_event(mrt, skb);
+
/*
 *  Deliver to user space multicast routing algorithms
 */
@@ -2457,6 +2459,67 @@ static void mr6_netlink_event(struct mr6_table *mrt, 
struct mfc6_cache *mfc,
rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
 }
 
+static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt)
+{
+   struct net *net = read_pnet(&mrt->net);
+   struct nlmsghdr *nlh;
+   struct rtgenmsg *rtgenm;
+   struct mrt6msg *msg;
+   struct sk_buff *skb;
+   struct nlattr *nla;
+   int payloadlen;
+   int msgsize;
+
+   payloadlen = pkt->len - sizeof(struct mrt6msg);
+   msg = (struct mrt6msg *)skb_transport_header(pkt);
+   msgsize = NLMSG_ALIGN(sizeof(struct rtgenmsg))
+   + nla_total_size(1)
+   /* IP6MRA_CREPORT_MSGTYPE */
+   + nla_total_size(2)
+   /* IP6MRA_CREPORT_MIF_ID */
+   + nla_total_size(sizeof(struct in6_addr))
+   /* IP6MRA_CREPORT_SRC_ADDR */
+   + nla_total_size(sizeof(struct in6_addr))
+   /* IP6MRA_CREPORT_DST_ADDR */
+   + nla_total_size(payloadlen)
+   /* IP6MRA_CREPORT_PKT */
+   ;
+
+   skb = nlmsg_new(msgsize, GFP_ATOMIC);
+   if (!skb)
+   goto errout;
+
+   nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+   sizeof(struct rtgenmsg), 0);
+   if (!nlh)
+   goto errout;
+   rtgenm = nlmsg_data(nlh);
+   rtgenm->rtgen_family = RTNL_FAMILY_IP6MR;
+   if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) ||
+   nla_put_u16(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) ||
+   nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR,
+                    &msg->im6_src) ||
+   nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR,
+                    &msg->im6_dst))
+   goto nla_put_failure;
+
+   nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen);
+   if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg),
+ nla_data(nla), payloadlen))
+   goto nla_put_failure;
+
+   nlmsg_end(skb, nlh);
+
+   rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC);
+   return;
+
+nla_put_failure:
+

[PATCH 3/5] rtlwifi: Add and use convenience macro rtl_btc

2017-06-19 Thread Joe Perches

bluetooth coexistence functions always check get_btc_status before
accessing the function.  Centralize this via a convenience macro
to neaten the source code a little.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/base.c |  8 ++--
 drivers/net/wireless/realtek/rtlwifi/core.c | 12 +++-
 drivers/net/wireless/realtek/rtlwifi/pci.c  |  4 +---
 drivers/net/wireless/realtek/rtlwifi/ps.c   | 24 ++--
 drivers/net/wireless/realtek/rtlwifi/wifi.h |  7 +++
 5 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c 
b/drivers/net/wireless/realtek/rtlwifi/base.c
index cc4b50e1b7e5..997dd692e6bb 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -1312,11 +1312,9 @@ static void setup_arp_tx(struct rtl_priv *rtlpriv, 
struct rtl_ps_ctl *ppsc)
 {
struct ieee80211_hw *hw = rtlpriv->hw;
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
rtlpriv->ra.is_special_data = true;
-   if (ops->get_btc_status())
-   btc_ops->btc_special_packet_notify(rtlpriv, 1);
+   rtl_btc(rtlpriv, ops, btc_special_packet_notify(rtlpriv, 1));
rtl_lps_leave(hw);
ppsc->last_delaylps_stamp_jiffies = jiffies;
 }
@@ -1575,7 +1573,6 @@ void rtl_watchdog_wq_callback(void *data)
struct ieee80211_hw *hw = rtlworks->hw;
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw));
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
bool busytraffic = false;
@@ -1714,8 +1711,7 @@ void rtl_watchdog_wq_callback(void *data)
}
}
 
-   if (ops->get_btc_status())
-   btc_ops->btc_periodical(rtlpriv);
+   rtl_btc(rtlpriv, ops, btc_periodical(rtlpriv));
 
rtlpriv->link_info.bcn_rx_inperiod = 0;
 }
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index e08febc2d0d6..8d3eddeeffea 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1046,7 +1046,6 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw 
*hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw));
@@ -1193,8 +1192,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw 
*hw,
ppsc->report_linked = (mstatus == RT_MEDIA_CONNECT) ?
  true : false;
 
-   if (ops->get_btc_status())
-   btc_ops->btc_mediastatus_notify(rtlpriv, mstatus);
+   rtl_btc(rtlpriv, ops, btc_mediastatus_notify(rtlpriv, mstatus));
}
 
if (changed & BSS_CHANGED_ERP_CTS_PROT) {
@@ -1428,7 +1426,6 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 
RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n");
@@ -1438,8 +1435,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw,
return;
}
 
-   if (ops->get_btc_status())
-   btc_ops->btc_scan_notify(rtlpriv, 1);
+   rtl_btc(rtlpriv, ops, btc_scan_notify(rtlpriv, 1));
 
if (rtlpriv->dm.supp_phymode_switch) {
if (ops->chk_switch_dmdp)
@@ -1465,7 +1461,6 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw 
*hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 
RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n");
@@ -1492,8 +1487,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw 
*hw,
}
 
ops->scan_operation_backup(hw, SCAN_OPT_RESTORE);
-   if (ops->get_btc_status())
-   btc_ops->btc_scan_notify(rtlpriv, 0);
+   rtl_btc(rtlpriv, ops, btc_scan_notify(rtlpriv, 0));
 }
 
 static int rtl_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index f129c4c76c05..fa93401acdab 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1859,15 +1859,13 @@ static void rtl_pci_stop(struct

[PATCH 0/5] rtlwifi: Neatening

2017-06-19 Thread Joe Perches

Joe Perches (5):
  rtlwifi: Use temporary ops variable to reduce code size
  rtlwifi: Use temporary variable btc_ops for rtlpriv->btcoexist.btc_ops
  rtlwifi: Add and use convenience macro rtl_btc
  realtek: btcoexist: Make the rtl_btc_ops struct const
  realtek: rtlwifi: drivers: Use the rtl_btc convenience macro

 drivers/net/wireless/realtek/rtlwifi/base.c|  46 ++--
 .../wireless/realtek/rtlwifi/btcoexist/rtl_btc.c   |   4 +-
 .../wireless/realtek/rtlwifi/btcoexist/rtl_btc.h   |   2 +-
 drivers/net/wireless/realtek/rtlwifi/core.c| 189 
 drivers/net/wireless/realtek/rtlwifi/efuse.c   |   9 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c | 242 +++--
 drivers/net/wireless/realtek/rtlwifi/ps.c  |  83 +++
 .../net/wireless/realtek/rtlwifi/rtl8192ee/hw.c|   3 +-
 .../wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c   |   4 +-
 .../net/wireless/realtek/rtlwifi/rtl8723ae/hw.c|   3 +-
 .../net/wireless/realtek/rtlwifi/rtl8723be/hw.c|   3 +-
 .../net/wireless/realtek/rtlwifi/rtl8821ae/fw.c|   6 +-
 .../net/wireless/realtek/rtlwifi/rtl8821ae/hw.c|   3 +-
 drivers/net/wireless/realtek/rtlwifi/usb.c |  40 ++--
 drivers/net/wireless/realtek/rtlwifi/wifi.h|   9 +-
 15 files changed, 336 insertions(+), 310 deletions(-)

-- 
2.10.0.rc2.1.g053435c

[PATCH 2/5] rtlwifi: Use temporary variable btc_ops for rtlpriv->btcoexist.btc_ops

2017-06-19 Thread Joe Perches

Reduce the code line length a little.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/base.c |  7 ---
 drivers/net/wireless/realtek/rtlwifi/core.c | 10 ++
 drivers/net/wireless/realtek/rtlwifi/pci.c  |  8 +---
 drivers/net/wireless/realtek/rtlwifi/ps.c   | 16 ++--
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c 
b/drivers/net/wireless/realtek/rtlwifi/base.c
index 4436addcace3..cc4b50e1b7e5 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -1312,11 +1312,11 @@ static void setup_arp_tx(struct rtl_priv *rtlpriv, 
struct rtl_ps_ctl *ppsc)
 {
struct ieee80211_hw *hw = rtlpriv->hw;
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
rtlpriv->ra.is_special_data = true;
if (ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_special_packet_notify(
-   rtlpriv, 1);
+   btc_ops->btc_special_packet_notify(rtlpriv, 1);
rtl_lps_leave(hw);
ppsc->last_delaylps_stamp_jiffies = jiffies;
 }
@@ -1575,6 +1575,7 @@ void rtl_watchdog_wq_callback(void *data)
struct ieee80211_hw *hw = rtlworks->hw;
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw));
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
bool busytraffic = false;
@@ -1714,7 +1715,7 @@ void rtl_watchdog_wq_callback(void *data)
}
 
if (ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_periodical(rtlpriv);
+   btc_ops->btc_periodical(rtlpriv);
 
rtlpriv->link_info.bcn_rx_inperiod = 0;
 }
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index 63f5c0cd6935..e08febc2d0d6 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1046,6 +1046,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw 
*hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw));
@@ -1193,8 +1194,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw 
*hw,
  true : false;
 
if (ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_mediastatus_notify(
-   rtlpriv, mstatus);
+   btc_ops->btc_mediastatus_notify(rtlpriv, mstatus);
}
 
if (changed & BSS_CHANGED_ERP_CTS_PROT) {
@@ -1428,6 +1428,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 
RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n");
@@ -1438,7 +1439,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw,
}
 
if (ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_scan_notify(rtlpriv, 1);
+   btc_ops->btc_scan_notify(rtlpriv, 1);
 
if (rtlpriv->dm.supp_phymode_switch) {
if (ops->chk_switch_dmdp)
@@ -1464,6 +1465,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw 
*hw,
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 
RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n");
@@ -1491,7 +1493,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw 
*hw,
 
ops->scan_operation_backup(hw, SCAN_OPT_RESTORE);
if (ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_scan_notify(rtlpriv, 0);
+   btc_ops->btc_scan_notify(rtlpriv, 0);
 }
 
 static int rtl_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 81c36978df8a..f129c4c76c05 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1814,6 +1814,7 @@ static int rtl_pci_start(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
+   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;

[PATCH 1/5] rtlwifi: Use temporary ops variable to reduce code size

2017-06-19 Thread Joe Perches

rtlpriv->cfg->ops-> is used frequently in the source.

Repeated use of this multiply dereferenced table creates larger objects.

Using a temporary reduces code size as well as source code line length.

$ size -t drivers/net/wireless/realtek/rtlwifi/*.o.defconfig.new
   text    data     bss     dec     hex filename
  12652    3024      24   15700    3d54 [...]/base.o.defconfig.new
  16700      83       0   16783    418f [...]/core.o.defconfig.new
  10794       0       4   10798    2a2e [...]/efuse.o.defconfig.new
  20988       5       0   20993    5201 [...]/pci.o.defconfig.new
   6182       8       0    6190    182e [...]/ps.o.defconfig.new
   8410       1       4    8415    20df [...]/usb.o.defconfig.new
  75726    3121      32   78879   1341f (TOTALS)

$ size -t drivers/net/wireless/realtek/rtlwifi/*.o.defconfig.old
   text    data     bss     dec     hex filename
  12604    3024      24   15652    3d24 [...]/base.o.defconfig.old
  16892      83       0   16975    424f [...]/core.o.defconfig.old
  10794       0       4   10798    2a2e [...]/efuse.o.defconfig.old
  21161       5       0   21166    52ae [...]/pci.o.defconfig.old
   6262       8       0    6270    187e [...]/ps.o.defconfig.old
   8435       1       4    8440    20f8 [...]/usb.o.defconfig.old
  76148    3121      32   79301   135c5 (TOTALS)

Miscellanea around modified code:

o Fix a few misindented code blocks
o Realign arguments
o Ignored 80 column checkpatch warnings

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/base.c  |  45 ++---
 drivers/net/wireless/realtek/rtlwifi/core.c  | 187 +++--
 drivers/net/wireless/realtek/rtlwifi/efuse.c |   9 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c   | 236 ++-
 drivers/net/wireless/realtek/rtlwifi/ps.c|  75 +
 drivers/net/wireless/realtek/rtlwifi/usb.c   |  40 +++--
 6 files changed, 313 insertions(+), 279 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c 
b/drivers/net/wireless/realtek/rtlwifi/base.c
index 710e5b447cff..4436addcace3 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(rtl_deinit_deferred_work);
 void rtl_init_rfkill(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
-
+   struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
bool radio_state;
bool blocked;
u8 valid = 0;
@@ -507,7 +507,7 @@ void rtl_init_rfkill(struct ieee80211_hw *hw)
rtlpriv->rfkill.rfkill_state = true;
wiphy_rfkill_set_hw_state(hw->wiphy, 0);
 
-   radio_state = rtlpriv->cfg->ops->radio_onoff_checking(hw, &valid);
+   radio_state = ops->radio_onoff_checking(hw, &valid);
 
if (valid) {
pr_info("rtlwifi: wireless switch is %s\n",
@@ -588,8 +588,9 @@ void rtl_init_rx_config(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
+   struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
 
-   rtlpriv->cfg->ops->get_hw_reg(hw, HW_VAR_RCR, (u8 *) (&mac->rx_conf));
+   ops->get_hw_reg(hw, HW_VAR_RCR, (u8 *)&mac->rx_conf);
 }
 EXPORT_SYMBOL_GPL(rtl_init_rx_config);
 
@@ -1178,13 +1179,14 @@ bool rtl_tx_mgmt_proc(struct ieee80211_hw *hw, struct 
sk_buff *skb)
 {
struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
struct rtl_priv *rtlpriv = rtl_priv(hw);
+   struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
__le16 fc = rtl_get_fc(skb);
 
if (rtlpriv->dm.supp_phymode_switch &&
mac->link_state < MAC80211_LINKED &&
(ieee80211_is_auth(fc) || ieee80211_is_probe_req(fc))) {
-   if (rtlpriv->cfg->ops->chk_switch_dmdp)
-   rtlpriv->cfg->ops->chk_switch_dmdp(hw);
+   if (ops->chk_switch_dmdp)
+   ops->chk_switch_dmdp(hw);
}
if (ieee80211_is_auth(fc)) {
RT_TRACE(rtlpriv, COMP_SEND, DBG_DMESG, "MAC80211_LINKING\n");
@@ -1309,11 +1311,12 @@ EXPORT_SYMBOL_GPL(rtl_action_proc);
 static void setup_arp_tx(struct rtl_priv *rtlpriv, struct rtl_ps_ctl *ppsc)
 {
struct ieee80211_hw *hw = rtlpriv->hw;
+   struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
 
rtlpriv->ra.is_special_data = true;
-   if (rtlpriv->cfg->ops->get_btc_status())
+   if (ops->get_btc_status())
rtlpriv->btcoexist.btc_ops->btc_special_packet_notify(
-   rtlpriv, 1);
+   rtlpriv, 1);
rtl_lps_leave(hw);
ppsc->last_delaylps_stamp_jiffies = jiffies;
 }
@@ -1571,6 +1574,7 @@ void rtl_watchdog_wq_callback(void *data)
watchdog_wq);
struct ieee80211_hw *hw = rtlworks->hw;
struct rtl_priv *rtlpriv = rtl_priv(hw);
+   struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
struct rtl_hal *rtlhal =

[PATCH 4/5] realtek: btcoexist: Make the rtl_btc_ops struct const

2017-06-19 Thread Joe Perches

Avoid allowing a write into what should be const.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h | 2 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c   | 2 +-
 drivers/net/wireless/realtek/rtlwifi/wifi.h  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c 
b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c
index 3ab0cfe26513..19a95b69255d 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c
@@ -29,7 +29,7 @@
 #include "rtl_btc.h"
 #include "halbt_precomp.h"
 
-static struct rtl_btc_ops rtl_btc_operation = {
+static const struct rtl_btc_ops rtl_btc_operation = {
.btc_init_variables = rtl_btc_init_variables,
.btc_init_hal_vars = rtl_btc_init_hal_vars,
.btc_init_hw_config = rtl_btc_init_hw_config,
@@ -161,7 +161,7 @@ void rtl_btc_special_packet_notify(struct rtl_priv 
*rtlpriv, u8 pkt_type)
return exhalbtc_special_packet_notify(&gl_bt_coexist, pkt_type);
 }
 
-struct rtl_btc_ops *rtl_btc_get_ops_pointer(void)
+const struct rtl_btc_ops *rtl_btc_get_ops_pointer(void)
 {
return &rtl_btc_operation;
 }
diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h 
b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h
index fff5117e1c4e..83c5bb2d6ad8 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h
@@ -44,7 +44,7 @@ bool rtl_btc_is_disable_edca_turbo(struct rtl_priv *rtlpriv);
 bool rtl_btc_is_bt_disabled(struct rtl_priv *rtlpriv);
 void rtl_btc_special_packet_notify(struct rtl_priv *rtlpriv, u8 pkt_type);
 
-struct rtl_btc_ops *rtl_btc_get_ops_pointer(void);
+const struct rtl_btc_ops *rtl_btc_get_ops_pointer(void);
 
 u8 rtl_get_hwpg_bt_exist(struct rtl_priv *rtlpriv);
 u8 rtl_get_hwpg_bt_type(struct rtl_priv *rtlpriv);
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index fa93401acdab..8000894c4212 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1814,7 +1814,7 @@ static int rtl_pci_start(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_hal_ops *ops = rtlpriv->cfg->ops;
-   struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
+   const struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw));
struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw));
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h 
b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 9a916188a703..d03f0ca92530 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2475,7 +2475,7 @@ struct rtl_btc_info {
 };
 
 struct bt_coexist_info {
-   struct rtl_btc_ops *btc_ops;
+   const struct rtl_btc_ops *btc_ops;
struct rtl_btc_info btc_info;
/* EEPROM BT info. */
u8 eeprom_bt_coexist;
-- 
2.10.0.rc2.1.g053435c

[PATCH 5/5] realtek: rtlwifi: drivers: Use the rtl_btc convenience macro

2017-06-19 Thread Joe Perches

Convert the uses of the btcoexist. to the rtl_btc macro to
save a few lines of code.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c  | 3 +--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c | 4 +---
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c  | 3 +--
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c  | 3 +--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c  | 6 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c  | 3 +--
 6 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
index 11d97fa0e921..9fc3e79e5a43 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
@@ -2572,8 +2572,7 @@ void rtl92ee_bt_hw_init(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
 
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv);
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv));
 }
 
 void rtl92ee_suspend(struct ieee80211_hw *hw)
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
index ec9bcf32f0ab..8621ea8f6644 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
@@ -1767,9 +1767,7 @@ void rtl_8723e_c2h_command_handle(struct ieee80211_hw *hw)
 
rtl8723e_dm_bt_parse_bt_info(hw, ptmp_buf, c2h_event.cmd_len);
 
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_periodical(rtlpriv);
-
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_periodical(rtlpriv));
break;
default:
break;
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c
index 5ac7b815648a..3b6d140bb863 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c
@@ -2419,8 +2419,7 @@ void rtl8723e_bt_hw_init(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
 
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv);
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv));
 }
 
 void rtl8723e_suspend(struct ieee80211_hw *hw)
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c
index a79f936bb394..bddc4f56832d 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c
@@ -2719,8 +2719,7 @@ void rtl8723be_bt_hw_init(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
 
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv);
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv));
 
 }
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
index 73350103b736..a7a537716a8e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
@@ -1879,10 +1879,8 @@ void rtl8821ae_c2h_content_parsing(struct ieee80211_hw 
*hw,
case C2H_8812_BT_INFO:
RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD,
 "[C2H], C2H_8812_BT_INFO!!\n");
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_btinfo_notify(rtlpriv,
- tmp_buf,
- 
c2h_cmd_len);
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops,
+   btc_btinfo_notify(rtlpriv, tmp_buf, c2h_cmd_len));
break;
default:
break;
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
index 2bc6bace069c..f76c64570d16 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
@@ -4012,8 +4012,7 @@ void rtl8821ae_bt_hw_init(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
 
-   if (rtlpriv->cfg->ops->get_btc_status())
-   rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv);
+   rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv));
 }
 
 void rtl8821ae_suspend(struct ieee80211_hw *hw)
-- 
2.10.0.rc2.1.g053435c

[PATCH iproute2 1/1] tc: fixed typo in usage text.

2017-06-19 Thread Roman Mashak

Signed-off-by: Roman Mashak 
---
 tc/f_u32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tc/f_u32.c b/tc/f_u32.c
index ff700e9..b272c2c 100644
--- a/tc/f_u32.c
+++ b/tc/f_u32.c
@@ -34,7 +34,7 @@ static void explain(void)
"Usage: ... u32 [ match SELECTOR ... ] [ link HTID ] [ classid 
CLASSID ]\n"
"   [ action ACTION_SPEC ] [ offset OFFSET_SPEC ]\n"
"   [ ht HTID ] [ hashkey HASHKEY_SPEC ]\n"
-   "   [ sample SAMPLE ] [skip-hw | skip-sw]\n"
+   "   [ sample SAMPLE ] [skip_hw | skip_sw]\n"
"or u32 divisor DIVISOR\n"
"\n"
"Where: SELECTOR := SAMPLE SAMPLE ...\n"
-- 
1.9.1

[PATCH] liquidio: stop using huge static buffer, save 4096k in .data

2017-06-19 Thread Denys Vlasenko

Only compile-tested - I don't have the hardware.

From code inspection, octeon_pci_write_core_mem() appears to be safe wrt
unaligned source. In any case, u8 fbuf[] was not guaranteed to be aligned
anyway.

Signed-off-by: Denys Vlasenko 
CC: Felix Manlunas 
CC: Prasad Kanneganti 
CC: Derek Chickles 
CC: David Miller 
CC: netdev@vger.kernel.org
CC: linux-ker...@vger.kernel.org
---
 drivers/net/ethernet/cavium/liquidio/octeon_console.c | 6 +-
 drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c | 4 ++--
 drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
index 53f38d0..e08f760 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
@@ -724,13 +724,11 @@ static int octeon_console_read(struct octeon_device *oct, 
u32 console_num,
 }
 
 #define FBUF_SIZE  (4 * 1024 * 1024)
-u8 fbuf[FBUF_SIZE];
 
 int octeon_download_firmware(struct octeon_device *oct, const u8 *data,
 size_t size)
 {
int ret = 0;
-   u8 *p = fbuf;
u32 crc32_result;
u64 load_addr;
u32 image_len;
@@ -805,10 +803,8 @@ int octeon_download_firmware(struct octeon_device *oct, 
const u8 *data,
else
size = FBUF_SIZE;
 
-   memcpy(p, data, size);
-
/* download the image */
-   octeon_pci_write_core_mem(oct, load_addr, p, (u32)size);
+   octeon_pci_write_core_mem(oct, load_addr, data, 
(u32)size);
 
data += size;
rem -= (u32)size;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
index 5cd96e7..4c85ae6 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
@@ -167,10 +167,10 @@ octeon_pci_read_core_mem(struct octeon_device *oct,
 void
 octeon_pci_write_core_mem(struct octeon_device *oct,
  u64 coreaddr,
- u8 *buf,
+ const u8 *buf,
  u32 len)
 {
-   __octeon_pci_rw_core_mem(oct, coreaddr, buf, len, 0);
+   __octeon_pci_rw_core_mem(oct, coreaddr, (u8 *)buf, len, 0);
 }
 
 u64 octeon_read_device_mem64(struct octeon_device *oct, u64 coreaddr)
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
index bae2fdd..47a3ff5 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
@@ -66,7 +66,7 @@ octeon_pci_read_core_mem(struct octeon_device *oct,
 void
 octeon_pci_write_core_mem(struct octeon_device *oct,
  u64 coreaddr,
- u8 *buf,
+ const u8 *buf,
  u32 len);
 
 #endif
-- 
2.9.2

Re: [PATCH net] sctp: ensure ep is not destroyed before doing the dump

2017-06-19 Thread David Miller

From: Xin Long 
Date: Sat, 17 Jun 2017 16:10:27 +0800

> Now before dumping a sock in sctp_diag, it only holds the sock while
> the ep may be already destroyed. It can cause a use-after-free panic
> when accessing ep->asocs.
> 
> This patch is to set sctp_sk(sk)->ep NULL in sctp_endpoint_destroy,
> and check if this ep is already destroyed before dumping this ep.
> 
> Suggested-by: Marcelo Ricardo Leitner 
> Signed-off-by: Xin Long 

Applied, thanks.

Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops

2017-06-19 Thread Daniel Borkmann

On 06/17/2017 11:48 PM, Lawrence Brakmo wrote:

On 6/16/17, 5:07 AM, "Daniel Borkmann"  wrote:

 On 06/15/2017 10:08 PM, Lawrence Brakmo wrote:
 > Two new corresponding structs (one for the kernel one for the user/BPF
 > program):
 >
 > /* kernel version */
 > struct bpf_socket_ops_kern {
 >  struct sock *sk;
 >   __u32  is_req_sock:1;
 >  __u32  op;
 >  union {
 >  __u32 reply;
 >  __u32 replylong[4];
 >  };
 > };
 >
 > /* user version */
 > struct bpf_socket_ops {
 >  __u32 op;
 >  union {
 >  __u32 reply;
 >  __u32 replylong[4];
 >  };
 >  __u32 family;
 >  __u32 remote_ip4;
 >  __u32 local_ip4;
 >  __u32 remote_ip6[4];
 >  __u32 local_ip6[4];
 >  __u32 remote_port;
 >  __u32 local_port;
 > };

 Above and ...

 struct bpf_sock {
__u32 bound_dev_if;
__u32 family;
__u32 type;
__u32 protocol;
 };

 ... would result in two BPF sock user versions. It's okayish, but
 given struct bpf_sock is quite generic, couldn't we merge the members
 from struct bpf_socket_ops into struct bpf_sock instead?

 Idea would be that sock_filter_is_valid_access() for cgroups would
 then check off < 0 || off + size > offsetofend(struct bpf_sock, protocol)
 to disallow new members, and your socket_ops_is_valid_access() could
 allow and xlate the full range. The family member is already duplicate
 and the others could then be accessed from these kind of BPF progs as
 well, plus we have a single user representation similar as with __sk_buff
 that multiple types will use.

I am concerned that it could make usage more confusing. One type of
sock program (cgroup) could only use a subset of the fields while the
other type (socket_ops) could use all (or a different subset). Then what
happens if there is a need to add a new field to cgroup type sock program?
In addition, in the near future I will have a patch to attach socket_ops
programs to cgroups.
I rather leave it as it is.

Okay, I'm fine with that as well. For the __sk_buff, we also have the
case that some members are not available for all program types like
tc_classid, so it's similar there. But if indeed the majority of members
cannot be supported for the most parts (?) then having different structs
seems okay, probably easier to use, but we should try hard to not end
up with 10 different uapi socket structs that apply to program types
working on sockets in one way or another.

Re: rtnetlink: add IFLA_GROUP to ifla_policy

2017-06-19 Thread David Miller

From: David Miller 
Date: Mon, 19 Jun 2017 14:47:44 -0400 (EDT)

> From: Serhey Popovych 
> Date: Fri, 16 Jun 2017 15:22:24 +0300
> 
>> Network interface groups support added while ago, however
>> there is no IFLA_GROUP attribute description in policy
>> and netlink message size calculations until now.
>> 
>> Add IFLA_GROUP attribute to the policy.
>> 
>> Fixes: cbda10fa97d7 ("net_device: add support for network device groups")
>> Signed-off-by: Serhey Popovych 
> 
> Applied and queued up for -stable, thanks.

Actually, this doesn't apply cleanly to the 'net' tree, please
respin.

Thanks.

Re: rtnetlink: add IFLA_GROUP to ifla_policy

2017-06-19 Thread David Miller

From: Serhey Popovych 
Date: Fri, 16 Jun 2017 15:22:24 +0300

> Network interface groups support added while ago, however
> there is no IFLA_GROUP attribute description in policy
> and netlink message size calculations until now.
> 
> Add IFLA_GROUP attribute to the policy.
> 
> Fixes: cbda10fa97d7 ("net_device: add support for network device groups")
> Signed-off-by: Serhey Popovych 

Applied and queued up for -stable, thanks.

Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops

2017-06-19 Thread Daniel Borkmann

On 06/17/2017 01:41 AM, Lawrence Brakmo wrote:

On 6/16/17, 5:07 AM, "Daniel Borkmann"  wrote:

[...]

I see. You are saying have one struct in common but still keep the two
PROG_TYPES? That makes sense. Do we really need two different
is_valid_access functions? Both types should be able to see all
the fields (otherwise adding new fields becomes messy).

Would probably leave the two is_valid_access() separate initially, and
once people ask for it we could potentially open this up to some of
the other fields that are available at that time.

 > Currently there are two types of ops. The first type expects the BPF
 > program to return a value which is then used by the caller (or a
 > negative value to indicate the operation is not supported). The second
 > type expects state changes to be done by the BPF program, for example
 > through a setsockopt BPF helper function, and they ignore the return
 > value.

[...]

 > +/* Call BPF_SOCKET_OPS program that returns an int. If the return value
 > + * is < 0, then the BPF op failed (for example if the loaded BPF
 > + * program does not support the chosen operation or there is no BPF
 > + * program loaded).
 > + */
 > +#ifdef CONFIG_BPF
 > +static inline int tcp_call_bpf(struct sock *sk, bool is_req_sock, int 
op)
 > +{
 > + struct bpf_socket_ops_kern socket_ops;
 > +
 > + memset(&socket_ops, 0, sizeof(socket_ops));
 > + socket_ops.sk = sk;
 > + socket_ops.is_req_sock = is_req_sock ? 1 : 0;

 Is is_req_sock actually used here in this patch (apart from setting it)?
 Not seeing that BPF prog will access it, if it also shouldn't access it,
 then bool type would be better.

The only reason I used a bit was in case I wanted to add more fields later on.
Does it make sense or should I just use bool?

Didn't know that, but I think starting out with bool seems a bit
cleaner, if needed we could later still switch to bitfield.

 > + socket_ops.op = op;
 > +
 > + return bpf_socket_ops_call(&socket_ops);
 > +}

[...]

 > +/* Global BPF program for sockets */
 > +static struct bpf_prog *bpf_socket_ops_prog;
 > +static DEFINE_RWLOCK(bpf_socket_ops_lock);
 > +
 > +int bpf_socket_ops_set_prog(int fd)
 > +{
 > + int err = 0;
 > +
 > + write_lock(&bpf_socket_ops_lock);
 > + if (bpf_socket_ops_prog) {
 > + bpf_prog_put(bpf_socket_ops_prog);
 > + bpf_socket_ops_prog = NULL;
 > + }
 > +
 > + /* fd of zero is used as a signal to remove the current
 > +  * bpf_socket_ops_prog.
 > +  */
 > + if (fd == 0) {

 Can we make the fd related semantics similar to dev_change_xdp_fd()?

Do you mean remove program if fd < 0 instead of == 0?

Yes, that and also the ordering of dropping the ref of the existing
bpf_socket_ops_prog program with setting the new one, so you can
convert bpf_socket_ops_prog to RCU more easily.

 > + write_unlock(&bpf_socket_ops_lock);
 > + return 1;
 > + }
 > +
 > + bpf_socket_ops_prog = bpf_prog_get_type(fd, 
BPF_PROG_TYPE_SOCKET_OPS);
 > + if (IS_ERR(bpf_socket_ops_prog)) {
 > + bpf_prog_put(bpf_socket_ops_prog);

 This will crash the kernel, passing err value to bpf_prog_put().

[...]

Re: [PATCH] loopback: Force LOOPBACK_IFINDEX for registration

2017-06-19 Thread David Miller

From: Serhey Popovych 
Date: Fri, 16 Jun 2017 15:10:03 +0300

> Now with commit 9c7dafb (net: Allow to create links with
> given ifindex) support registration of network devices
> with specific ifindex is added.
> 
> We can force loopback network device index before call to
> register_netdev() to ensure we always configure it with
> LOOPBACK_IFINDEX.
> 
> Kill BUG_ON() since system can continue without network
> namespace failed in loopback init path, unless it is
> init_net namespace where we panic() anyway.
> 
> Signed-off-by: Serhey Popovych 

Is the BUG_ON() triggering, if so why?

It looks to me that unless there is a bug, this assignment
is unnecessary.

Re: [PATCH v3 0/4] PTP support for macb driver

2017-06-19 Thread David Miller

From: Rafal Ozieblo 
Date: Fri, 16 Jun 2017 12:58:18 +0100

> This patch series adds support for PTP synchronization protocol
> in Cadence GEM driver based on PHC.

This doesn't apply cleanly to net-next, please respin.

Re: ipv6: Do not leak throw route references

2017-06-19 Thread David Miller

From: Serhey Popovych 
Date: Fri, 16 Jun 2017 14:42:17 +0300

> While commit 73ba57b (ipv6: fix backtracking for throw routes)
> does good job on error propagation to the fib_rules_lookup()
> in fib rules core framework that also corrects throw routes
> handling, it does not solve route reference leakage problem
> happened when we return -EAGAIN to the fib_rules_lookup()
> and leave routing table entry referenced in arg->result.
> 
> If rule with matched throw route isn't last matched in the
> list we overwrite arg->result losing reference on throw
> route stored previously forever.
> 
> We also partially revert commit ab997ad (ipv6: fix the
> incorrect return value of throw route) since we never return
> routing table entry with dst.error == -EAGAIN when
> CONFIG_IPV6_MULTIPLE_TABLES is on. Also there is no point
> to check for RTF_REJECT flag since it is always set throw
> route.
> 
> Fixes: 73ba57b (ipv6: fix backtracking for throw routes)
> Signed-off-by: Serhey Popovych 

This does not apply cleanly to the net tree, please respin.

Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test

2017-06-19 Thread David Miller

From: Lin Yun Sheng 
Date: Fri, 16 Jun 2017 17:24:51 +0800

> This patch fixes the phy loopback self_test failed issue. when
> Marvell Phy Module is loaded, it will powerdown fiber when doing
> phy loopback self test, which cause phy loopback self_test fail.
> 
> Signed-off-by: Lin Yun Sheng 

Applied.

Re: [PATCH v2] arm: eBPF JIT compiler

2017-06-19 Thread Daniel Borkmann


On 06/17/2017 02:23 PM, Shubham Bansal wrote:

Hi Daniel,


Not all of the helpers have 4 or less byte arguments only, there are a
few with 8 byte arguments, so making that general assumption wouldn't
work. I guess what could be done is that helpers have a flag in struct
bpf_func_proto which indicates for JITs that all args are 4 byte on 32bit
so you could probably use convention similar to case2 for them. Presumably
for that information to process, the JIT might need to be reworked to
extract that via bpf_analyzer() that does a verifier run to re-analyze
the program like in nfp JIT case.


Let me try a better solution which can be used to support both 4 byte
and 8 byte arguments. I hope it would work out. Are you sure this
patch can pass if it only supports 4 byte arguments though?
Let me list out what I have to do, so that you can tell me if I am
thinking in a wrong way :-

* I will add a bit flag in bpf_func_proto to represent whether
different arguments in a function call are 4 bytes or 8 bytes. If lsb
of bit flag is set then first argument is 8 byte, otherwise its not. I
think I can handle this flag properly in build_insn() in my code. Does
this sound okay?

I don't understand second part of your solution, i.e.


Presumably
for that information to process, the JIT might need to be reworked to
extract that via bpf_analyzer() that does a verifier run to re-analyze
the program like in nfp JIT case.


Please explain what are you suggesting and how can I extract bit flag
from bpf_func_proto().

Please reply asap, as I would like to finish it over the weekend. Please.


Sorry, had a travel over the weekend, so didn't read it in time.

What is the issue with imitating in JIT what the interpreter is
doing as a starting point? That should be generic enough to handle
any case.

Otherwise you'd need some sort of reverse mapping since verifier
already converted BPF_CALL insns into relative helper addresses
in imm part.


-Shubham

Re: [PATCH net] net: 8021q: Fix one possible panic caused by BUG_ON in free_netdev

2017-06-19 Thread David Miller

From: gfree.w...@vip.163.com
Date: Fri, 16 Jun 2017 15:00:02 +0800

> From: Gao Feng 
> 
> The register_vlan_device would invoke free_netdev directly, when
> register_vlan_dev failed. It would trigger the BUG_ON in free_netdev
> if the dev was already registered. In this case, the netdev would be
> freed in netdev_run_todo later.
> 
> So add one condition check now. Only when dev is not registered, then
> free it directly.
> 
> The following is the part coredump when netdev_upper_dev_link failed
> in register_vlan_dev. I removed the lines which are too long.
 ...
> Signed-off-by: Gao Feng 

Ok, I guess this is how we will have to fix this.

Applied, thanks.

Re: [PATCH v2 1/2] ip_tunnel: fix ip tunnel lookup in collect_md mode

2017-06-19 Thread Pravin Shelar

On Mon, Jun 19, 2017 at 6:13 AM, 严海双  wrote:
>
>
>> On 19 Jun 2017, at 1:43 PM, Pravin Shelar  wrote:
>>
>> On Fri, Jun 16, 2017 at 8:27 PM, Haishuang Yan
>>  wrote:
>>> In collect_md mode, if the tun dev is down, it still can call
>>> ip_tunnel_rcv to receive on packets, and the rx statistics increase
>>> improperly.
>>>
>>> Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.")
>>> Cc: Pravin B Shelar 
>>> Signed-off-by: Haishuang Yan 
>>>
>>> ---
>>> Change since v2:
>>>  * Fix wrong recipient address
>>> ---
>>> net/ipv4/ip_tunnel.c | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
>>> index 0f1d876..a3caba1 100644
>>> --- a/net/ipv4/ip_tunnel.c
>>> +++ b/net/ipv4/ip_tunnel.c
>>> @@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net 
>>> *itn,
>>>return cand;
>>>
>>>t = rcu_dereference(itn->collect_md_tun);
>>> -   if (t)
>>> +   if (t && (t->dev->flags & IFF_UP))
>>>return t;
>>>
>> It would be nice if we could increment drop count if tunnel device is not up.
>>
> Hi Pravin
>
> I think it’s not necessary, for example as gre tunnel, if ipgre_rcv fails, it 
> would trigger send an icmp unreachable
> message:
>
> if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
> return 0;
>
> icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
>
> Since the tunnel device didn’t touch the packets, so increase drop statistics 
> is not necessary.
>
icmp err packets are not reliable on all networks. device stats are
much more convenient during debugging connectivity issues.

Re: [PATCH v3 2/2] tcp: md5: add TCP_MD5SIG_EXT socket option to set a key address prefix

2017-06-19 Thread David Miller

From: Ivan Delalande 
Date: Thu, 15 Jun 2017 18:07:07 -0700

> Replace first padding in the tcp_md5sig structure with a new flag field
> and address prefix length so it can be specified when configuring a new
> key for TCP MD5 signature. The tcpm_flags field will only be used if the
> socket option is TCP_MD5SIG_EXT to avoid breaking existing programs, and
> tcpm_prefixlen only when the TCP_MD5SIG_FLAG_PREFIX flag is set.
> 
> Signed-off-by: Bob Gilligan 
> Signed-off-by: Eric Mowat 
> Signed-off-by: Ivan Delalande 

Applied but I had to renumber TCP_MD5SIG_EXT to 32 since 31 is already
taken by TCP_ULP in my tree.

It's a shame we had to add a new sockopt number to do this, but I can't
think of a better idea.

Thanks.

Re: [PATCH v3 1/2] tcp: md5: add an address prefix for key lookup

2017-06-19 Thread David Miller

From: Ivan Delalande 
Date: Thu, 15 Jun 2017 18:07:06 -0700

> This allows the keys used for TCP MD5 signature to be used for whole
> range of addresses, specified with a prefix length, instead of only one
> address as it currently is.
> 
> Signed-off-by: Bob Gilligan 
> Signed-off-by: Eric Mowat 
> Signed-off-by: Ivan Delalande 

Applied.

Re: [PATCH net-next 3/4] s390/diag: add diag26c support

2017-06-19 Thread David Miller

From: Martin Schwidefsky 
Date: Mon, 19 Jun 2017 17:34:25 +0200

> We (as in the s390 guys) tend to add __packed to hardware and hypervisor
> structures even if the attribute is not strictly necessary. Most of the
> diagnose related structures look that way. Dunno if it is worth to change
> them.

It causes gcc to generate bad code on certain platforms (yes, probably not
yours) and is in general something to avoid.

Please do not use __packed unless absolutely necessary.

> The diag26c struct needs to be aligned on a doubleword boundary, the
> __aligned(8) is necessary.

That's fine.

> The __packed attribute is again superfluous but follows along the
> lines of the other diag structures.

Please remove it.

Re: [Bug 196093] New: dot1q S-VLAN frame on dot1ad configured interface is accepted

2017-06-19 Thread Jason Lixfeld

Confirmed.  Works with 1.8.1-3 on Debian via unstable.

> On Jun 18, 2017, at 10:35 PM, Toshiaki Makita  
> wrote:
> 
> Hi,
> 
> On 2017/06/17 0:40, Stephen Hemminger wrote:
>> I suspect that VLAN offload on this Intel NIC is allowing any of the VLAN 
>> types.
>> 
>> Begin forwarded message:
>> 
>> Date: Fri, 16 Jun 2017 15:33:35 +
>> From: bugzilla-dae...@bugzilla.kernel.org
>> To: step...@networkplumber.org
>> Subject: [Bug 196093] New: dot1q S-VLAN frame on dot1ad configured interface 
>> is accepted
>> 
>> 
>> https://bugzilla.kernel.org/show_bug.cgi?id=196093
>> 
>>Bug ID: 196093
>>   Summary: dot1q S-VLAN frame on dot1ad configured interface is
>>accepted
>>   Product: Networking
>>   Version: 2.5
>>Kernel Version: 3.16.0 and 4.9.0
>>  Hardware: Intel
>>OS: Linux
>>  Tree: Mainline
>>Status: NEW
>>  Severity: normal
>>  Priority: P1
>> Component: Other
>>  Assignee: step...@networkplumber.org
>>  Reporter: jason-kernelbugzi...@lixfeld.ca
>>Regression: No
>> 
>> Using the following configuration on an Intel 82599 port.  Tested in Debian 8
>> with Kernel 3.16.0 and 4.9.0:
>> 
>> ip link set dev eth4 up
>> ip link add link eth4 eth4.100ad type vlan proto 802.1ad id 100
>> ip link add link eth4.100ad eth4.100ad.10q type vlan proto 802.1Q id 10
>> ip link set dev eth4 netns nni-ad
>> ip link set dev eth4.100ad netns nni-ad
>> ip link set dev eth4.100ad.10q netns nni-ad
>> ip netns exec nni-ad ip link set dev eth4 up
>> ip netns exec nni-ad ip link set dev eth4.100ad up
>> ip netns exec nni-ad ip link set dev eth4.100ad.10q up
>> ip netns exec nni-ad ip addr add 10.4.100.10/8 dev eth4.100ad.10q
>> 
>> Ping to 10.4.100.10 while doing tcpdump on eth4 shows the frame has ether 
>> type
>> 0x8100 (dot1q) on the S-VLAN, not 0x88a8 (dot1ad), yet the frame is still
> 
> libpcap was not reliable in vlan protocol parsing.
> https://github.com/the-tcpdump-group/libpcap/pull/346
> AFAIK libpcap 1.7.2 is required to parse it correctly.
> 
>> accepted, and an echo reply is generated.
>> 
>> The echo reply has the correct ethertype on the S-VLAN (0x88a8).  My
>> understanding is that if the frame received on the wire does not match the
>> ether type of the configured interface, the frame should be dropped?
> 
> Yes, it should.
> 
> Toshiaki Makita
>

Re: [PATCH nf-next] netns: add and use net_ns_barrier

2017-06-19 Thread Pablo Neira Ayuso

On Tue, May 30, 2017 at 11:38:12AM +0200, Florian Westphal wrote:
> Quoting Joe Stringer:
>   If a user loads nf_conntrack_ftp, sends FTP traffic through a network
>   namespace, destroys that namespace then unloads the FTP helper module,
>   then the kernel will crash.
> 
> Events that lead to the crash:
> 1. conntrack is created with ftp helper in netns x
> 2. This netns is destroyed
> 3. netns destruction is scheduled
> 4. netns destruction wq starts, removes netns from global list
> 5. ftp helper is unloaded, which resets all helpers of the conntracks
> via for_each_net()
> 
> but because netns is already gone from list the for_each_net() loop
> doesn't include it, therefore all of these conntracks are unaffected.
> 
> 6. helper module unload finishes
> 7. netns wq invokes destructor for rmmod'ed helper

Applied, thanks everyone.

Re: [PATCH net] netfilter: do not hold dev in ipt_CLUSTERIP

2017-06-19 Thread Pablo Neira Ayuso

On Sat, May 20, 2017 at 05:08:06PM +0800, Xin Long wrote:
> It's a terrible thing to hold dev in iptables target. When the dev is
> being removed, unregister_netdevice has to wait for the dev to become
> free. dmesg will keep logging the err:
> 
>   kernel:unregister_netdevice: waiting for veth0_in to become free. \
>   Usage count = 1
> 
> until iptables rules with this target are removed manually.
> 
> The worse thing is when deleting a netns, a virtual nic will be deleted
> instead of reset to init_net in default_device_ops exit/exit_batch. As
> it is earlier than to flush the iptables rules in iptable_filter_net_ops
> exit, unregister_netdevice will block to wait for the nic to become free.
> 
> As unregister_netdevice is actually waiting for iptables rules flushing
> while iptables rules have to be flushed after unregister_netdevice. This
> 'dead lock' will cause unregister_netdevice to block there forever. As
> the netns is not available to operate at that moment, iptables rules can
> not even be flushed manually either.
> 
> The reproducer can be:
> 
>   # ip netns add test
>   # ip link add veth0_in type veth peer name veth0_out
>   # ip link set veth0_in netns test
>   # ip netns exec test ip link set lo up
>   # ip netns exec test ip link set veth0_in up
>   # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \
> CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \
> --local-node 1 --hashmode sourceip-sourceport
>   # ip netns del test
> 
> This issue can be triggered by all virtual nics with ipt_CLUSTERIP.
> 
> This patch is to fix it by not holding dev in ipt_CLUSTERIP, but only
> save dev->ifindex instead of dev. When removing the mc from the dev,
> it will get dev by c->ifindex through dev_get_by_index.
> 
> Note that it doesn't save dev->name but dev->ifindex, as a dev->name
> can be changed, it will confuse ipt_CLUSTERIP.

Applied to nf-next.

This problem has been there since day 1, and it's a large patch, so I
prefer we follow nf-next path.

Thanks!

Re: [PATCH net-next] net/mlx4_en: don't set CHECKSUM_COMPLETE on SCTP packets

2017-06-19 Thread Davide Caratti

hello Tariq,
On Sun, 2017-06-18 at 14:10 +0300, Tariq Toukan wrote:
> > @@ -624,12 +632,13 @@ static int check_csum(struct mlx4_cqe *cqe, struct 
> > sk_buff *skb, void *va,
> >    hdr += sizeof(struct vlan_hdr);
> >    }
> >    
> > - if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4))
> > - get_fixed_ipv4_csum(hw_checksum, skb, hdr);
> > + if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) &&
> > + (unlikely(get_fixed_ipv4_csum(hw_checksum, skb, hdr))))
> 
> No! The lazy evaluation trick is wrong here.
> This way you'll end up going almost always to the else (ipv6) for the 
> wrong reason.

you are right! thanks for spotting this.

> > + return -1;
> >    #if IS_ENABLED(CONFIG_IPV6)
> > - else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
> > - if (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr)))
> > - return -1;
> > + else if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) &&
> > +  (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr))))
> > + return -1;
> 
> Let's not change this, might cause future bugs, similarly to the one above.
> >    #endif
> >    return 0;
> >    }

maybe we can avoid adding braces, remove that 'else' keyword and the nested 
'if',
thus saving one line, given that check_csum() returns the same set of values as
get_fixed_ipv{4,6}_checksum(), with the same meaning (-1 => go with 
CHECKSUM_NONE,
0 => go with CHECKSUM_COMPLETE).

 >8 
@@ -625,11 +633,10 @@ static int check_csum(struct mlx4_cqe *cqe, struct 
sk_buff *skb, void *va,
}
 
if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4))
-   get_fixed_ipv4_csum(hw_checksum, skb, hdr);
+   return get_fixed_ipv4_csum(hw_checksum, skb, hdr);
 #if IS_ENABLED(CONFIG_IPV6)
-   else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
-   if (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr)))
-   return -1;
+   if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
+   return get_fixed_ipv6_csum(hw_checksum, skb, hdr);
 #endif
return 0;
 }
 8< 

I will test and repost a v2 with this modification, unless you have any
objections. Thank you in advance!
regards
--
davide

Re: [PATCH V3 net-next 2/8] net: hns3: Add support of the HNAE3 framework

2017-06-19 Thread Stephen Hemminger

On Sat, 17 Jun 2017 18:24:25 +0100
Salil Mehta  wrote:

> +
> +/* This struct defines the operation on the handle.
> + *
> + * init_ae_dev(): (mandatory)
> + *   Get PF configure from pci_dev and initialize PF hardware
> + * uninit_ae_dev()
> + *   Disable PF device and release PF resource
> + * register_client
> + *   Register client to ae_dev
> + * unregister_client()
> + *   Unregister client from ae_dev
> + * start()
> + *   Enable the hardware
> + * stop()
> + *   Disable the hardware
> + * get_status()
> + *   Get the carrier state of the back channel of the handle, 1 for ok, 0 for
> + *   non-ok
> + * get_ksettings_an_result()
> + *   Get negotiation status,speed and duplex
> + * update_speed_duplex_h()
> + *   Update hardware speed and duplex
> + * get_media_type()
> + *   Get media type of MAC
> + * adjust_link()
> + *   Adjust link status
> + * set_loopback()
> + *   Set loopback
> + * set_promisc_mode
> + *   Set promisc mode
> + * set_mtu()
> + *   set mtu
> + * get_pauseparam()
> + *   get tx and rx of pause frame use
> + * set_pauseparam()
> + *   set tx and rx of pause frame use
> + * set_autoneg()
> + *   set auto autonegotiation of pause frame use
> + * get_autoneg()
> + *   get auto autonegotiation of pause frame use
> + * get_coalesce_usecs()
> + *   get usecs to delay a TX interrupt after a packet is sent
> + * get_rx_max_coalesced_frames()
> + *   get Maximum number of packets to be sent before a TX interrupt.
> + * set_coalesce_usecs()
> + *   set usecs to delay a TX interrupt after a packet is sent
> + * set_coalesce_frames()
> + *   set Maximum number of packets to be sent before a TX interrupt.
> + * get_mac_addr()
> + *   get mac address
> + * set_mac_addr()
> + *   set mac address
> + * add_uc_addr
> + *   Add unicast addr to mac table
> + * rm_uc_addr
> + *   Remove unicast addr from mac table
> + * set_mc_addr()
> + *   Set multicast address
> + * add_mc_addr
> + *   Add multicast address to mac table
> + * rm_mc_addr
> + *   Remove multicast address from mac table
> + * update_stats()
> + *   Update Old network device statistics
> + * get_ethtool_stats()
> + *   Get ethtool network device statistics
> + * get_strings()
> + *   Get a set of strings that describe the requested objects
> + * get_sset_count()
> + *   Get number of strings that @get_strings will write
> + * update_led_status()
> + *   Update the led status
> + * set_led_id()
> + *   Set led id
> + * get_regs()
> + *   Get regs dump
> + * get_regs_len()
> + *   Get the len of the regs dump
> + * get_rss_key_size()
> + *   Get rss key size
> + * get_rss_indir_size()
> + *   Get rss indirection table size
> + * get_rss()
> + *   Get rss table
> + * set_rss()
> + *   Set rss table
> + * get_tc_size()
> + *   Get tc size of handle
> + * get_vector()
> + *   Get vector number and vector information
> + * map_ring_to_vector()
> + *   Map rings to vector
> + * unmap_ring_from_vector()
> + *   Unmap rings from vector
> + * add_tunnel_udp()
> + *   Add tunnel information to hardware
> + * del_tunnel_udp()
> + *   Delete tunnel information from hardware
> + * reset_queue()
> + *   Reset queue
> + * get_fw_version()
> + *   Get firmware version
> + * get_mdix_mode()
> + *   Get media type of phy
> + * set_vlan_filter()
> + *   Set vlan filter config of Ports
> + * set_vf_vlan_filter()
> + *   Set vlan filter config of vf
> + */
> +struct hnae3_ae_ops {
> + int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev);
> + void (*uninit_ae_dev)(struct hnae3_ae_dev *ae_dev);
> +
> + int (*register_client)(struct hnae3_client *client,
> +struct hnae3_ae_dev *ae_dev);
> + void (*unregister_client)(struct hnae3_client *client,
> +   struct hnae3_ae_dev *ae_dev);
> + int (*start)(struct hnae3_handle *handle);
> + void (*stop)(struct hnae3_handle *handle);
> + int (*get_status)(struct hnae3_handle *handle);
> + void (*get_ksettings_an_result)(struct hnae3_handle *handle,
> + u8 *auto_neg, u32 *speed, u8 *duplex);
> +
> + int (*update_speed_duplex_h)(struct hnae3_handle *handle);
> + int (*cfg_mac_speed_dup_h)(struct hnae3_handle *handle, int speed,
> +u8 duplex);
> +
> + void (*get_media_type)(struct hnae3_handle *handle, u8 *media_type);
> + void (*adjust_link)(struct hnae3_handle *handle, int speed, int duplex);
> + int (*set_loopback)(struct hnae3_handle *handle,
> + enum hnae3_loop loop_mode, bool en);
> +
> + void (*set_promisc_mode)(struct hnae3_handle *handle, u32 en);
> + int (*set_mtu)(struct hnae3_handle *handle, int new_mtu);
> +
> + void (*get_pauseparam)(struct hnae3_handle *handle,
> +u32 *auto_neg, u32 *rx_en, u32 *tx_en);
> + int (*set_pauseparam)(struct hnae3_handle *handle,
> +   u32 auto_neg, u32 rx_en, u32 tx_en);
> +
> + int

Re: [PATCH v1 1/2] dt-binding: ptp: add bindings document for dte based ptp clock

2017-06-19 Thread Arun Parameswaran

Hi David, Rob,
I will address all of Rob's comments below.

Since a part of the patch was applied to 'net-next', would you like me to
send a new patch (based on the applied one), or a 'V2' of this patch ?

Thanks
Arun

On 17-06-18 07:04 AM, Rob Herring wrote:
> On Mon, Jun 12, 2017 at 01:26:00PM -0700, Arun Parameswaran wrote:
>> Add device tree binding documentation for the Broadcom DTE
>> PTP clock driver.
>>
>> Signed-off-by: Arun Parameswaran 
>> ---
>>  Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt | 13 +
>>  1 file changed, 13 insertions(+)
>>  create mode 100644 Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt
>>
>> diff --git a/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt 
>> b/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt
>> new file mode 100644
>> index 000..07590bc
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt
>> @@ -0,0 +1,13 @@
>> +* Broadcom Digital Timing Engine(DTE) based PTP clock driver
> 
> Bindings describe h/w, not drivers.
> 
>> +
>> +Required properties:
>> +- compatible: should be "brcm,ptp-dte"
> 
> Looks too generic. You need SoC specific compatible strings.
> 
>> +- reg: address and length of the DTE block's NCO registers
>> +
>> +Example:
>> +
>> +ptp_dte: ptp_dte@180af650 {
> 
> Don't use '_' in node names.
> 
>> +compatible = "brcm,ptp-dte";
>> +reg = <0x180af650 0x10>;
>> +status = "okay";
>> +};
>> -- 
>> 1.9.1
>>

[PATCH net-next] net: stmmac: enable TSO for IPv6

2017-06-19 Thread Niklas Cassel

There is nothing in the IP that prevents us from enabling TSO for IPv6.

Before patch:
ftp fe80::2aa:bbff:fecc:1336%eth0
ftp> get /dev/zero
882512708 bytes received in 00:14 (56.11 MiB/s)

After patch:
ftp fe80::2aa:bbff:fecc:1336%eth0
ftp> get /dev/zero
1203326784 bytes received in 00:12 (94.52 MiB/s)

Signed-off-by: Niklas Cassel 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 6a1cb59728fe..fefbf817399a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2965,7 +2965,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, 
struct net_device *dev)
 
/* Manage oversized TCP frames for GMAC4 device */
if (skb_is_gso(skb) && priv->tso) {
-   if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+   if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
return stmmac_tso_xmit(skb, dev);
}
 
@@ -4126,7 +4126,7 @@ int stmmac_dvr_probe(struct device *device,
NETIF_F_RXCSUM;
 
if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) {
-   ndev->hw_features |= NETIF_F_TSO;
+   ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
priv->tso = true;
dev_info(priv->device, "TSO feature enabled\n");
}
-- 
2.11.0

[PATCH net-next] ibmvnic: Return from ibmvnic_resume if not in VNIC_OPEN state

2017-06-19 Thread John Allen

If the ibmvnic driver is not in the VNIC_OPEN state, return from
ibmvnic_resume callback. If we are not in the VNIC_OPEN state, interrupts
may not be initialized and directly calling the interrupt handler will
cause a crash.

Signed-off-by: John Allen 
---
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 722daf5..0135095 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3859,6 +3859,9 @@ static int ibmvnic_resume(struct device *dev)
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
int i;

+   if (adapter->state != VNIC_OPEN)
+   return 0;
+
/* kick the interrupt handlers just in case we lost an interrupt */
for (i = 0; i < adapter->req_rx_queues; i++)
ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq,

[PATCH] dt-bindings: net: smsc911x: Add missing optional VDD regulators

2017-06-19 Thread Krzysztof Kozlowski

The lan911x family of devices requires 3.3 V power supplies (connected
to the VDD_IO, VDD_A and VREG_3.3 pins).  The existing driver, however,
obtains only the VDD_IO and VDD_A regulators, and treats both as
optional, so document this in the bindings.

Signed-off-by: Krzysztof Kozlowski 
---
 Documentation/devicetree/bindings/net/smsc911x.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt 
b/Documentation/devicetree/bindings/net/smsc911x.txt
index 16c3a9501f5d..acfafc8e143c 100644
--- a/Documentation/devicetree/bindings/net/smsc911x.txt
+++ b/Documentation/devicetree/bindings/net/smsc911x.txt
@@ -27,6 +27,7 @@ Optional properties:
   of the device. On many systems this is wired high so the device goes
   out of reset at power-on, but if it is under program control, this
   optional GPIO can wake up in response to it.
+- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies
 
 Examples:
 
-- 
2.9.3

Re: [PATCH V2 net-next 1/8] net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC

2017-06-19 Thread Stephen Hemminger

On Wed, 14 Jun 2017 00:10:28 +0100
Salil Mehta  wrote:

> +hns3_nic_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 
> *stats)
> +{
> + struct hns3_nic_priv *priv = netdev_priv(ndev);
> + int queue_num = priv->ae_handle->kinfo.num_tqps;
> + u64 tx_bytes = 0;
> + u64 rx_bytes = 0;
> + u64 tx_pkts = 0;
> + u64 rx_pkts = 0;
> + int idx = 0;
unnecessary initialization

> +
> + for (idx = 0; idx < queue_num; idx++) {
> + tx_bytes += priv->ring_data[idx].ring->stats.tx_bytes;
> + tx_pkts += priv->ring_data[idx].ring->stats.tx_pkts;
> + rx_bytes +=
> + priv->ring_data[idx + queue_num].ring->stats.rx_bytes;
> + rx_pkts += priv->ring_data[idx + queue_num].ring->stats.rx_pkts;
> + }
> +

Since rx_bytes and the other statistics are 64-bit values, you need to use
something to ensure that updates to these values are atomic on 32-bit
platforms.  The most common way to handle this is with the u64_stats_sync
mechanism, which is a nop on 64-bit architectures and uses a seqcount
to do updates on 32-bit CPUs.

Re: [PATCH net-next 3/4] s390/diag: add diag26c support

2017-06-19 Thread Martin Schwidefsky

Hi Dave,

On Mon, 19 Jun 2017 10:47:26 -0400 (EDT)
David Miller  wrote:

> From: Julian Wiedmann 
> Date: Mon, 19 Jun 2017 13:22:24 +0200
> 
> > +#define DIAG26C_GET_MAC0x
> > +struct diag26c_mac_req {
> > +   u32 resp_buf_len;
> > +   u32 resp_version;
> > +   u16 op_code;
> > +   u16 devno;
> > +   u8  res[4];
> > +} __packed;  
> 
> The packed attribute is not necessary here, the structure will be
> perfectly packed together because of the types used and the order of
> the members.

We (as in the s390 guys) tend to add __packed to hardware and hypervisor
structures even if the attribute is not strictly necessary. Most of the
diagnose-related structures look that way. I don't know whether it is worth
changing them.

I agree that __packed should be avoided for software defined structures.

> __packed is to be used only in the last possible resort for
> correctness and every effort whatsoever should be used to avoid using
> it.
> 
> > +
> > +struct diag26c_mac_resp {
> > +   u32 version;
> > +   u8  mac[ETH_ALEN];
> > +   u16 res;
> > +} __packed __aligned(8);  
> 
> Using packed with an 8 byte alignment is even more unnecessary.
> 
> Again, it is not needed, so please don't use it.

The diag26c struct needs to be aligned on a doubleword boundary, the
__aligned(8) is necessary. The __packed attribute is again superfluous but
follows along the lines of the other diag structures.

I do not mind the extra __packed attributes, but if you care about them
we could remove them from the structures in diag.h.

> > + */
> > +static inline int __diag26c(void *req, void *resp, enum diag26c_sc 
> > subcode)  
> 
> Do not mark functions inline in *.c files, let the compiler decide.
> 

Here I disagree. Basically all of our functions with assembly code are
static inline, it is a common pattern even in C files. Sometimes the compiler
*is* stupid and won't inline a function. And on s390 function calls do not
come for free.

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

Re: [PATCH 06/44] iommu/dma: don't rely on DMA_ERROR_CODE

2017-06-19 Thread Robin Murphy

On 16/06/17 19:10, Christoph Hellwig wrote:
> DMA_ERROR_CODE is not a public API and will go away soon.  The dma-iommu
> driver already implements a proper ->mapping_error method, so it's only
> using the value internally.  Add a new local define using the value
> that arm64, which is the only current user of dma-iommu, uses.

I was angling for just open-coding 0/!dma_addr/etc. for simplicity rather
than having anything #defined at all - nothing except the 4th and final
hunks actually has any relevance to dma_mapping_error(), and I reckon
it's plenty clear enough in context. The rest is just proactively
blatting address arguments with "arbitrary definitely-invalid value",
which is more paranoia than anything else (and arguably unnecessary).

It's not the biggest deal, though, so either way:

Reviewed-by: Robin Murphy 

> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/iommu/dma-iommu.c | 18 ++
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 62618e77bedc..9403336f1fa6 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -31,6 +31,8 @@
>  #include 
>  #include 
>  
> +#define IOMMU_MAPPING_ERROR  0
> +
>  struct iommu_dma_msi_page {
>   struct list_headlist;
>   dma_addr_t  iova;
> @@ -500,7 +502,7 @@ void iommu_dma_free(struct device *dev, struct page 
> **pages, size_t size,
>  {
>   __iommu_dma_unmap(iommu_get_domain_for_dev(dev), *handle, size);
>   __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
> - *handle = DMA_ERROR_CODE;
> + *handle = IOMMU_MAPPING_ERROR;
>  }
>  
>  /**
> @@ -533,7 +535,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t 
> size, gfp_t gfp,
>   dma_addr_t iova;
>   unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
>  
> - *handle = DMA_ERROR_CODE;
> + *handle = IOMMU_MAPPING_ERROR;
>  
>   min_size = alloc_sizes & -alloc_sizes;
>   if (min_size < PAGE_SIZE) {
> @@ -627,11 +629,11 @@ static dma_addr_t __iommu_dma_map(struct device *dev, 
> phys_addr_t phys,
>  
>   iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
>   if (!iova)
> - return DMA_ERROR_CODE;
> + return IOMMU_MAPPING_ERROR;
>  
>   if (iommu_map(domain, iova, phys - iova_off, size, prot)) {
>   iommu_dma_free_iova(cookie, iova, size);
> - return DMA_ERROR_CODE;
> + return IOMMU_MAPPING_ERROR;
>   }
>   return iova + iova_off;
>  }
> @@ -671,7 +673,7 @@ static int __finalise_sg(struct device *dev, struct 
> scatterlist *sg, int nents,
>  
>   s->offset += s_iova_off;
>   s->length = s_length;
> - sg_dma_address(s) = DMA_ERROR_CODE;
> + sg_dma_address(s) = IOMMU_MAPPING_ERROR;
>   sg_dma_len(s) = 0;
>  
>   /*
> @@ -714,11 +716,11 @@ static void __invalidate_sg(struct scatterlist *sg, int 
> nents)
>   int i;
>  
>   for_each_sg(sg, s, nents, i) {
> - if (sg_dma_address(s) != DMA_ERROR_CODE)
> + if (sg_dma_address(s) != IOMMU_MAPPING_ERROR)
>   s->offset += sg_dma_address(s);
>   if (sg_dma_len(s))
>   s->length = sg_dma_len(s);
> - sg_dma_address(s) = DMA_ERROR_CODE;
> + sg_dma_address(s) = IOMMU_MAPPING_ERROR;
>   sg_dma_len(s) = 0;
>   }
>  }
> @@ -836,7 +838,7 @@ void iommu_dma_unmap_resource(struct device *dev, 
> dma_addr_t handle,
>  
>  int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
>  {
> - return dma_addr == DMA_ERROR_CODE;
> + return dma_addr == IOMMU_MAPPING_ERROR;
>  }
>  
>  static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>

[PATCH 0/2] fix loadable module for DPAA Ethernet

2017-06-19 Thread Madalin Bucur

The DPAA Ethernet makes use of a symbol that is not exported.
Address the issue by propagating the dma_ops rather than calling
arch_setup_dma_ops().

Madalin Bucur (2):
  fsl/fman: propagate dma_ops
  dpaa_eth: reuse the dma_ops provided by the FMan MAC device

 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +-
 drivers/net/ethernet/freescale/fman/mac.c  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

-- 
2.1.0

1 2 >

1 - 100 of 183 matches

Mail list logo