[PATCH net-next 06/12] nfp: add stats and xmit helpers for representors
Provide helpers for stats and xmit on representor netdevs. Parts based on work by Bert van Leeuwen, Benjamin LaHaise and Jakub Kicinski. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 198 +- drivers/net/ethernet/netronome/nfp/nfp_net_repr.h | 28 +++ 2 files changed, 225 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index bdd34d206d22..a97bb6f2cc12 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -38,9 +38,191 @@ #include "nfpcore/nfp_cpp.h" #include "nfp_app.h" #include "nfp_main.h" +#include "nfp_net_ctrl.h" #include "nfp_net_repr.h" #include "nfp_port.h" +static void +nfp_repr_inc_tx_stats(struct net_device *netdev, unsigned int len, + int tx_status) +{ + struct nfp_repr *repr = netdev_priv(netdev); + struct nfp_repr_pcpu_stats *stats; + + if (unlikely(tx_status != NET_XMIT_SUCCESS && +tx_status != NET_XMIT_CN)) { + this_cpu_inc(repr->stats->tx_drops); + return; + } + + stats = this_cpu_ptr(repr->stats); + u64_stats_update_begin(>syncp); + stats->tx_packets++; + stats->tx_bytes += len; + u64_stats_update_end(>syncp); +} + +void nfp_repr_inc_rx_stats(struct net_device *netdev, unsigned int len) +{ + struct nfp_repr *repr = netdev_priv(netdev); + struct nfp_repr_pcpu_stats *stats; + + stats = this_cpu_ptr(repr->stats); + u64_stats_update_begin(>syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(>syncp); +} + +void +nfp_repr_phy_port_get_stats64(const struct nfp_app *app, u8 phy_port, + struct rtnl_link_stats64 *stats) +{ + u8 __iomem *mem; + + mem = app->pf->mac_stats_mem + phy_port * NFP_MAC_STATS_SIZE; + + /* TX and RX stats are flipped as we are returning the stats as seen +* at the switch port corresponding to the phys port. 
+*/ + stats->tx_packets = readq(mem + NFP_MAC_STATS_RX_FRAMES_RECEIVED_OK); + stats->tx_bytes = readq(mem + NFP_MAC_STATS_RX_IN_OCTETS); + stats->tx_dropped = readq(mem + NFP_MAC_STATS_RX_IN_ERRORS); + + stats->rx_packets = readq(mem + NFP_MAC_STATS_TX_FRAMES_TRANSMITTED_OK); + stats->rx_bytes = readq(mem + NFP_MAC_STATS_TX_OUT_OCTETS); + stats->rx_dropped = readq(mem + NFP_MAC_STATS_TX_OUT_ERRORS); +} + +void +nfp_repr_vf_get_stats64(const struct nfp_app *app, u8 vf, + struct rtnl_link_stats64 *stats) +{ + u8 __iomem *mem; + + mem = app->pf->vf_cfg_mem + vf * NFP_NET_CFG_BAR_SZ; + + /* TX and RX stats are flipped as we are returning the stats as seen +* at the switch port corresponding to the VF. +*/ + stats->tx_packets = readq(mem + NFP_NET_CFG_STATS_RX_FRAMES); + stats->tx_bytes = readq(mem + NFP_NET_CFG_STATS_RX_OCTETS); + stats->tx_dropped = readq(mem + NFP_NET_CFG_STATS_RX_DISCARDS); + + stats->rx_packets = readq(mem + NFP_NET_CFG_STATS_TX_FRAMES); + stats->rx_bytes = readq(mem + NFP_NET_CFG_STATS_TX_OCTETS); + stats->rx_dropped = readq(mem + NFP_NET_CFG_STATS_TX_DISCARDS); +} + +void +nfp_repr_pf_get_stats64(const struct nfp_app *app, u8 pf, + struct rtnl_link_stats64 *stats) +{ + u8 __iomem *mem; + + if (pf) + return; + + mem = nfp_cpp_area_iomem(app->pf->data_vnic_bar); + + stats->tx_packets = readq(mem + NFP_NET_CFG_STATS_RX_FRAMES); + stats->tx_bytes = readq(mem + NFP_NET_CFG_STATS_RX_OCTETS); + stats->tx_dropped = readq(mem + NFP_NET_CFG_STATS_RX_DISCARDS); + + stats->rx_packets = readq(mem + NFP_NET_CFG_STATS_TX_FRAMES); + stats->rx_bytes = readq(mem + NFP_NET_CFG_STATS_TX_OCTETS); + stats->rx_dropped = readq(mem + NFP_NET_CFG_STATS_TX_DISCARDS); +} + +void +nfp_repr_get_stats64(const struct nfp_app *app, enum nfp_repr_type type, +u8 port, struct rtnl_link_stats64 *stats) +{ + switch (type) { + case NFP_REPR_TYPE_PHYS_PORT: + nfp_repr_phy_port_get_stats64(app, port, stats); + break; + case NFP_REPR_TYPE_PF: + nfp_repr_pf_get_stats64(app, port, stats); 
+ break; + case NFP_REPR_TYPE_VF: + nfp_repr_vf_get_stats64(app, port, stats); + default: + break; + } +} + +bool +nfp_repr_has_offload_stats(const struct net_device *dev, int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return true; + } + + return false; +} + +static int +nfp_repr_get_host_stats64(const struct net_device *netdev, +
[PATCH net-next 02/12] nfp: devlink add support for getting eswitch mode
From: Jakub Kicinski Add app callback for reporting eswitch mode. Non-SRIOV apps should not implement this callback; nfp_app code will then respond with -EOPNOTSUPP. Signed-off-by: Jakub Kicinski Signed-off-by: Simon Horman --- drivers/net/ethernet/netronome/nfp/nfp_app.h | 15 +++ drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 18 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index f5e373fa8c3b..0fee14ffa081 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -34,6 +34,8 @@ #ifndef _NFP_APP_H #define _NFP_APP_H 1 +#include + struct bpf_prog; struct net_device; struct pci_dev; @@ -70,6 +72,7 @@ extern const struct nfp_app_type app_bpf; * @setup_tc: setup TC ndo * @tc_busy: TC HW offload busy (rules loaded) * @xdp_offload:offload an XDP program + * @eswitch_mode_get:get SR-IOV eswitch mode */ struct nfp_app_type { enum nfp_app_id id; @@ -95,6 +98,8 @@ struct nfp_app_type { bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn); int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); + + enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app); }; /** @@ -216,6 +221,16 @@ static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb) app->type->ctrl_msg_rx(app, skb); } +static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode) +{ + if (!app->type->eswitch_mode_get) + return -EOPNOTSUPP; + + *mode = app->type->eswitch_mode_get(app); + + return 0; +} + const char *nfp_app_mip_name(struct nfp_app *app); struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 2609a0f28e81..6c9f29c2e975 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ 
-149,9 +149,27 @@ nfp_devlink_port_unsplit(struct devlink *devlink, unsigned int port_index) return ret; } +static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct nfp_pf *pf = devlink_priv(devlink); + int ret; + + mutex_lock(>lock); + if (!pf->app) { + ret = -EBUSY; + goto out; + } + ret = nfp_app_eswitch_mode_get(pf->app, mode); +out: + mutex_unlock(>lock); + + return ret; +} + const struct devlink_ops nfp_devlink_ops = { .port_split = nfp_devlink_port_split, .port_unsplit = nfp_devlink_port_unsplit, + .eswitch_mode_get = nfp_devlink_eswitch_mode_get, }; int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) -- 2.1.4
[PATCH net-next 11/12] nfp: add flower app
Add app for flower offload. At this point the PF netdev and phys port representor netdevs are initialised. Follow-up work will add support for VF and PF representors and beyond that offloading the flower classifier. Based in part on work by Benjamin LaHaise and Bert van Leeuwen. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/Makefile | 1 + drivers/net/ethernet/netronome/nfp/flower/main.c | 294 +++ drivers/net/ethernet/netronome/nfp/nfp_app.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_app.h | 4 + 4 files changed, 300 insertions(+) create mode 100644 drivers/net/ethernet/netronome/nfp/flower/main.c diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index e14f62863add..10b556b2c59d 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -28,6 +28,7 @@ nfp-objs := \ bpf/main.o \ bpf/offload.o \ flower/cmsg.o \ + flower/main.o \ nic/main.o ifeq ($(CONFIG_BPF_SYSCALL),y) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c new file mode 100644 index ..01864840a21b --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -0,0 +1,294 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "../nfpcore/nfp_cpp.h" +#include "../nfpcore/nfp_nsp.h" +#include "../nfp_app.h" +#include "../nfp_main.h" +#include "../nfp_net.h" +#include "../nfp_net_repr.h" +#include "../nfp_port.h" +#include "./cmsg.h" + +/** + * struct nfp_flower_priv - Flower APP per-vNIC priv data + * @nn: Pointer to vNIC + */ +struct nfp_flower_priv { + struct nfp_net *nn; +}; + +static const char *nfp_flower_extra_cap(struct nfp_app *app, struct nfp_net *nn) +{ + return "FLOWER"; +} + +static enum devlink_eswitch_mode eswitch_mode_get(struct nfp_app *app) +{ + return DEVLINK_ESWITCH_MODE_SWITCHDEV; +} + +static enum nfp_repr_type +nfp_flower_repr_get_type_and_port(struct nfp_app *app, u32 port_id, u8 *port) +{ + switch (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port_id)) { + case NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT: + *port = FIELD_GET(NFP_FLOWER_CMSG_PORT_PHYS_PORT_NUM, + port_id); + return NFP_REPR_TYPE_PHYS_PORT; + + case NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT: + *port = FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC, port_id); + if (FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC_TYPE, port_id) == + NFP_FLOWER_CMSG_PORT_VNIC_TYPE_PF) + return NFP_REPR_TYPE_PF; + else + return NFP_REPR_TYPE_VF; + } + + return NFP_FLOWER_CMSG_PORT_TYPE_UNSPEC; +} + 
+static struct net_device * +nfp_flower_repr_get(struct nfp_app *app, u32 port_id) +{ + enum nfp_repr_type repr_type; + struct nfp_reprs *reprs; + u8 port = 0; + + repr_type = nfp_flower_repr_get_type_and_port(app, port_id, ); + + reprs = rcu_dereference(app->reprs[repr_type]); + if (!reprs) + return NULL; + + if (port >= reprs->num_reprs) + return NULL; + + return reprs->reprs[port]; +} + +static void +nfp_flower_repr_netdev_get_stats64(struct
[PATCH net-next 05/12] nfp: general representor implementation
Provide infrastructure to create and destroy representors of a given type. Parts based on work by Bert van Leeuwen, Benjamin LaHaise, and Jakub Kicinski. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/Makefile | 1 + drivers/net/ethernet/netronome/nfp/nfp_app.c | 20 +++ drivers/net/ethernet/netronome/nfp/nfp_app.h | 18 +++ drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 156 ++ drivers/net/ethernet/netronome/nfp/nfp_net_repr.h | 92 + 5 files changed, 287 insertions(+) create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index 5ad9a557f06a..a401113035f5 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -22,6 +22,7 @@ nfp-objs := \ nfp_net_common.o \ nfp_net_ethtool.o \ nfp_net_main.o \ + nfp_net_repr.o \ nfp_netvf_main.o \ nfp_port.o \ bpf/main.o \ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 396b93f54823..c9ccb0f94604 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -38,6 +38,7 @@ #include "nfpcore/nfp_nffw.h" #include "nfp_app.h" #include "nfp_main.h" +#include "nfp_net_repr.h" static const struct nfp_app_type *apps[] = { _nic, @@ -68,6 +69,25 @@ struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size) return skb; } +struct nfp_reprs * +nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type, + struct nfp_reprs *reprs) +{ + struct nfp_reprs *old; + + old = rcu_dereference_protected(app->reprs[type], + lockdep_is_held(>pf->lock)); + if (reprs && old) { + old = ERR_PTR(-EBUSY); + goto exit_unlock; + } + + rcu_assign_pointer(app->reprs[type], reprs); + +exit_unlock: + return old; +} + struct nfp_app *nfp_app_alloc(struct 
nfp_pf *pf, enum nfp_app_id id) { struct nfp_app *app; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 0fee14ffa081..af023a0491e7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -36,6 +36,8 @@ #include +#include "nfp_net_repr.h" + struct bpf_prog; struct net_device; struct pci_dev; @@ -73,6 +75,7 @@ extern const struct nfp_app_type app_bpf; * @tc_busy: TC HW offload busy (rules loaded) * @xdp_offload:offload an XDP program * @eswitch_mode_get:get SR-IOV eswitch mode + * @repr_get: get representor netdev */ struct nfp_app_type { enum nfp_app_id id; @@ -100,6 +103,7 @@ struct nfp_app_type { struct bpf_prog *prog); enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app); + struct net_device *(*repr_get)(struct nfp_app *app, u32 id); }; /** @@ -108,6 +112,7 @@ struct nfp_app_type { * @pf:backpointer to NFP PF structure * @cpp: pointer to the CPP handle * @ctrl: pointer to ctrl vNIC struct + * @reprs: array of pointers to representors * @type: pointer to const application ops and info */ struct nfp_app { @@ -116,6 +121,7 @@ struct nfp_app { struct nfp_cpp *cpp; struct nfp_net *ctrl; + struct nfp_reprs __rcu *reprs[NFP_REPR_TYPE_MAX + 1]; const struct nfp_app_type *type; }; @@ -231,6 +237,18 @@ static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode) return 0; } +static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id) +{ + if (unlikely(!app || !app->type->repr_get)) + return NULL; + + return app->type->repr_get(app, id); +} + +struct nfp_reprs * +nfp_app_reprs_set(struct nfp_app *app, enum nfp_repr_type type, + struct nfp_reprs *reprs); + const char *nfp_app_mip_name(struct nfp_app *app); struct sk_buff *nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c new file mode 
100644 index ..bdd34d206d22 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the
[PATCH net-next 04/12] nfp: map mac_stats and vf_cfg BARs
If present, map the mac_stats and vf_cfg BARs. These will be used by representor netdevs to read statistics for phys port and VF representors. Also provide defines describing the layout of the mac_stats area. Similar defines are already present for the vf_cfg area. Based in part on work by Jakub Kicinski. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_main.h | 8 ++ drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 116 +++-- drivers/net/ethernet/netronome/nfp/nfp_port.h | 60 +++ .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h | 2 + .../ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 5 +- 5 files changed, 161 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h index 88724f8d0dcd..aa69d4101eb9 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h @@ -68,6 +68,10 @@ struct nfp_rtsym_table; * @data_vnic_bar: Pointer to the CPP area for the data vNICs' BARs * @ctrl_vnic_bar: Pointer to the CPP area for the ctrl vNIC's BAR * @qc_area: Pointer to the CPP area for the queues + * @mac_stats_bar: Pointer to the CPP area for the MAC stats + * @mac_stats_mem: Pointer to mapped MAC stats area + * @vf_cfg_bar:Pointer to the CPP area for the VF configuration BAR + * @vf_cfg_mem:Pointer to mapped VF configuration area * @irq_entries: Array of MSI-X entries for all vNICs * @limit_vfs: Number of VFs supported by firmware (~0 for PCI limit) * @num_vfs: Number of SR-IOV VFs enabled @@ -97,6 +101,10 @@ struct nfp_pf { struct nfp_cpp_area *data_vnic_bar; struct nfp_cpp_area *ctrl_vnic_bar; struct nfp_cpp_area *qc_area; + struct nfp_cpp_area *mac_stats_bar; + u8 __iomem *mac_stats_mem; + struct nfp_cpp_area *vf_cfg_bar; + u8 __iomem *vf_cfg_mem; struct msix_entry *irq_entries; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index 
bc2bc0886176..eb87e1c08bb1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -570,6 +570,79 @@ static void nfp_net_pf_app_stop(struct nfp_pf *pf) nfp_net_pf_app_stop_ctrl(pf); } +static void nfp_net_pci_unmap_mem(struct nfp_pf *pf) +{ + if (pf->vf_cfg_bar) + nfp_cpp_area_release_free(pf->vf_cfg_bar); + if (pf->mac_stats_bar) + nfp_cpp_area_release_free(pf->mac_stats_bar); + nfp_cpp_area_release_free(pf->qc_area); + nfp_cpp_area_release_free(pf->data_vnic_bar); +} + +static int nfp_net_pci_map_mem(struct nfp_pf *pf) +{ + u32 ctrl_bar_sz; + u8 __iomem *mem; + int err; + + ctrl_bar_sz = pf->max_data_vnics * NFP_PF_CSR_SLICE_SIZE; + mem = nfp_net_pf_map_rtsym(pf, "net.ctrl", "_pf%d_net_bar0", + ctrl_bar_sz, >data_vnic_bar); + if (IS_ERR(mem)) { + err = PTR_ERR(mem); + if (!pf->fw_loaded && err == -ENOENT) + err = -EPROBE_DEFER; + return err; + } + + pf->mac_stats_mem = nfp_net_pf_map_rtsym(pf, "net.macstats", +"_mac_stats", +NFP_MAC_STATS_SIZE * +(pf->eth_tbl->max_index + 1), +>mac_stats_bar); + if (IS_ERR(pf->mac_stats_mem)) { + if (PTR_ERR(pf->mac_stats_mem) != -ENOENT) { + err = PTR_ERR(pf->mac_stats_mem); + goto err_unmap_ctrl; + } + pf->mac_stats_mem = NULL; + } + + pf->vf_cfg_mem = nfp_net_pf_map_rtsym(pf, "net.vfcfg", + "_pf%d_net_vf_bar", + NFP_NET_CFG_BAR_SZ * + pf->limit_vfs, >vf_cfg_bar); + if (IS_ERR(pf->vf_cfg_mem)) { + if (PTR_ERR(pf->vf_cfg_mem) != -ENOENT) { + err = PTR_ERR(pf->vf_cfg_mem); + goto err_unmap_mac_stats; + } + pf->vf_cfg_mem = NULL; + } + + mem = nfp_net_map_area(pf->cpp, "net.qc", 0, 0, + NFP_PCIE_QUEUE(0), NFP_QCP_QUEUE_AREA_SZ, + >qc_area); + if (IS_ERR(mem)) { + nfp_err(pf->cpp, "Failed to map Queue Controller area.\n"); + err = PTR_ERR(mem); + goto err_unmap_vf_cfg; + } + + return 0; + +err_unmap_vf_cfg: + if (pf->vf_cfg_bar) +
[PATCH net-next 12/12] nfp: add VF and PF representors to flower app
Initialise VF and PF representors in flower app. Based in part on work by Benjamin LaHaise, Bert van Leeuwen and Jakub Kicinski. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/flower/main.c | 86 +++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 01864840a21b..b30f1c4ffd3a 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -149,15 +149,81 @@ static const struct net_device_ops nfp_flower_repr_netdev_ops = { .ndo_get_offload_stats = nfp_repr_get_offload_stats, }; +static void nfp_flower_sriov_disable(struct nfp_app *app) +{ + nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_VF); +} + +static int +nfp_flower_spawn_vnic_reprs(struct nfp_app *app, + enum nfp_flower_cmsg_port_vnic_type vnic_type, + enum nfp_repr_type repr_type, unsigned int cnt) +{ + u8 nfp_pcie = nfp_cppcore_pcie_unit(app->pf->cpp); + struct nfp_flower_priv *priv = app->priv; + struct nfp_reprs *reprs, *old_reprs; + const u8 queue = 0; + int i, err; + + reprs = nfp_reprs_alloc(cnt); + if (!reprs) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + u32 port_id; + + reprs->reprs[i] = nfp_repr_alloc(app); + if (!reprs->reprs[i]) { + err = -ENOMEM; + goto err_reprs_clean; + } + + SET_NETDEV_DEV(reprs->reprs[i], >nn->pdev->dev); + eth_hw_addr_inherit(reprs->reprs[i], priv->nn->dp.netdev); + + port_id = nfp_flower_cmsg_pcie_port(nfp_pcie, vnic_type, + i, queue); + err = nfp_repr_init(app, reprs->reprs[i], + _flower_repr_netdev_ops, + port_id, NULL, priv->nn->dp.netdev); + if (err) + goto err_reprs_clean; + + nfp_info(app->cpp, "%s%d Representor(%s) created\n", +repr_type == NFP_REPR_TYPE_PF ? 
"PF" : "VF", i, +reprs->reprs[i]->name); + } + + old_reprs = nfp_app_reprs_set(app, repr_type, reprs); + if (IS_ERR(old_reprs)) { + err = PTR_ERR(old_reprs); + goto err_reprs_clean; + } + + return 0; +err_reprs_clean: + nfp_reprs_clean_and_free(reprs); + return err; +} + +static int nfp_flower_sriov_enable(struct nfp_app *app, int num_vfs) +{ + return nfp_flower_spawn_vnic_reprs(app, + NFP_FLOWER_CMSG_PORT_VNIC_TYPE_VF, + NFP_REPR_TYPE_VF, num_vfs); +} + static void nfp_flower_stop(struct nfp_app *app) { + nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PF); nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PHYS_PORT); + } -static int nfp_flower_start(struct nfp_app *app) +static int +nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) { struct nfp_eth_table *eth_tbl = app->pf->eth_tbl; - struct nfp_flower_priv *priv = app->priv; struct nfp_reprs *reprs, *old_reprs; unsigned int i; int err; @@ -218,6 +284,19 @@ static int nfp_flower_start(struct nfp_app *app) return err; } +static int nfp_flower_start(struct nfp_app *app) +{ + int err; + + err = nfp_flower_spawn_phy_reprs(app, app->priv); + if (err) + return err; + + return nfp_flower_spawn_vnic_reprs(app, + NFP_FLOWER_CMSG_PORT_VNIC_TYPE_PF, + NFP_REPR_TYPE_PF, 1); +} + static void nfp_flower_vnic_clean(struct nfp_app *app, struct nfp_net *nn) { kfree(app->priv); @@ -289,6 +368,9 @@ const struct nfp_app_type app_flower = { .ctrl_msg_rx= nfp_flower_cmsg_rx, + .sriov_enable = nfp_flower_sriov_enable, + .sriov_disable = nfp_flower_sriov_disable, + .eswitch_mode_get = eswitch_mode_get, .repr_get = nfp_flower_repr_get, }; -- 2.1.4
[PATCH net-next 07/12] nfp: app callbacks for SRIOV
Add app-callbacks for app-specific initialisation of SRIOV. Disabling SRIOV is brought forward in nfp_pci_remove() so that nfp_app_sriov_disable is called while the app still exists. This is intended to be used to implement representor netdevs for virtual ports. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_app.h | 18 drivers/net/ethernet/netronome/nfp/nfp_main.c | 42 +++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index af023a0491e7..ff2d43615808 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -75,6 +75,8 @@ extern const struct nfp_app_type app_bpf; * @tc_busy: TC HW offload busy (rules loaded) * @xdp_offload:offload an XDP program * @eswitch_mode_get:get SR-IOV eswitch mode + * @sriov_enable: app-specific sriov initialisation + * @sriov_disable: app-specific sriov clean-up * @repr_get: get representor netdev */ struct nfp_app_type { @@ -102,6 +104,9 @@ struct nfp_app_type { int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); + int (*sriov_enable)(struct nfp_app *app, int num_vfs); + void (*sriov_disable)(struct nfp_app *app); + enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app); struct net_device *(*repr_get)(struct nfp_app *app, u32 id); }; @@ -237,6 +242,19 @@ static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode) return 0; } +static inline int nfp_app_sriov_enable(struct nfp_app *app, int num_vfs) +{ + if (!app || !app->type->sriov_enable) + return -EOPNOTSUPP; + return app->type->sriov_enable(app, num_vfs); +} + +static inline void nfp_app_sriov_disable(struct nfp_app *app) +{ + if (app && app->type->sriov_disable) + app->type->sriov_disable(app); +} + static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id) { if (unlikely(!app || 
!app->type->repr_get)) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index 4e59dcb78c36..748e54cc885e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -54,6 +54,7 @@ #include "nfpcore/nfp6000_pcie.h" +#include "nfp_app.h" #include "nfp_main.h" #include "nfp_net.h" @@ -97,28 +98,45 @@ static int nfp_pcie_sriov_enable(struct pci_dev *pdev, int num_vfs) struct nfp_pf *pf = pci_get_drvdata(pdev); int err; + mutex_lock(>lock); + if (num_vfs > pf->limit_vfs) { nfp_info(pf->cpp, "Firmware limits number of VFs to %u\n", pf->limit_vfs); - return -EINVAL; + err = -EINVAL; + goto err_unlock; + } + + err = nfp_app_sriov_enable(pf->app, num_vfs); + if (err) { + dev_warn(>dev, "App specific PCI sriov configuration failed: %d\n", +err); + goto err_unlock; } err = pci_enable_sriov(pdev, num_vfs); if (err) { dev_warn(>dev, "Failed to enable PCI sriov: %d\n", err); - return err; + goto err_app_sriov_disable; } pf->num_vfs = num_vfs; dev_dbg(>dev, "Created %d VFs.\n", pf->num_vfs); + mutex_unlock(>lock); return num_vfs; + +err_app_sriov_disable: + nfp_app_sriov_disable(pf->app); +err_unlock: + mutex_unlock(>lock); + return err; #endif return 0; } -static int nfp_pcie_sriov_disable(struct pci_dev *pdev) +static int __nfp_pcie_sriov_disable(struct pci_dev *pdev) { #ifdef CONFIG_PCI_IOV struct nfp_pf *pf = pci_get_drvdata(pdev); @@ -132,6 +150,8 @@ static int nfp_pcie_sriov_disable(struct pci_dev *pdev) return -EPERM; } + nfp_app_sriov_disable(pf->app); + pf->num_vfs = 0; pci_disable_sriov(pdev); @@ -140,6 +160,18 @@ static int nfp_pcie_sriov_disable(struct pci_dev *pdev) return 0; } +static int nfp_pcie_sriov_disable(struct pci_dev *pdev) +{ + struct nfp_pf *pf = pci_get_drvdata(pdev); + int err; + + mutex_lock(>lock); + err = __nfp_pcie_sriov_disable(pdev); + mutex_unlock(>lock); + + return err; +} + static int nfp_pcie_sriov_configure(struct pci_dev *pdev, int 
num_vfs) { if (num_vfs == 0) @@ -431,11 +463,11 @@ static void nfp_pci_remove(struct pci_dev *pdev) devlink = priv_to_devlink(pf); - nfp_net_pci_remove(pf); - nfp_pcie_sriov_disable(pdev); pci_sriov_set_totalvfs(pf->pdev, 0); + nfp_net_pci_remove(pf); +
[PATCH net-next 09/12] nfp: add support for tx/rx with metadata portid
Allow tx/rx with metadata port id. This will be used for tx/rx of representor netdevs acting as upper-devices while a pf netdev acts as a lower-device. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 1 + .../net/ethernet/netronome/nfp/nfp_net_common.c| 57 +++--- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 02fd8d4e253c..96c8ea476c05 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -318,6 +318,7 @@ struct nfp_meta_parsed { u8 csum_type; u32 hash; u32 mark; + u32 portid; __wsum csum; }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 2b1ae666..046e4d929e93 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -755,6 +755,26 @@ static void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring) tx_ring->wr_ptr_add = 0; } +static int nfp_net_prep_port_id(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + unsigned char *data; + + if (likely(!md_dst)) + return 0; + if (unlikely(md_dst->type != METADATA_HW_PORT_MUX)) + return 0; + + if (unlikely(skb_cow_head(skb, 8))) + return -ENOMEM; + + data = skb_push(skb, 8); + put_unaligned_be32(NFP_NET_META_PORTID, data); + memcpy(data + 4, _dst->u.port_info.port_id, 4); + + return 8; +} + /** * nfp_net_tx() - Main transmit entry point * @skb:SKB to transmit @@ -767,6 +787,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) struct nfp_net *nn = netdev_priv(netdev); const struct skb_frag_struct *frag; struct nfp_net_tx_desc *txd, txdg; + int f, nr_frags, wr_idx, md_bytes; struct nfp_net_tx_ring *tx_ring; struct nfp_net_r_vector *r_vec; struct nfp_net_tx_buf *txbuf; @@ -774,8 +795,6 @@ static int 
nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) struct nfp_net_dp *dp; dma_addr_t dma_addr; unsigned int fsize; - int f, nr_frags; - int wr_idx; u16 qidx; dp = >dp; @@ -797,6 +816,13 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_BUSY; } + md_bytes = nfp_net_prep_port_id(skb); + if (unlikely(md_bytes < 0)) { + nfp_net_tx_xmit_more_flush(tx_ring); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Start with the head skbuf */ dma_addr = dma_map_single(dp->dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); @@ -815,7 +841,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) /* Build TX descriptor */ txd = _ring->txds[wr_idx]; - txd->offset_eop = (nr_frags == 0) ? PCIE_DESC_TX_EOP : 0; + txd->offset_eop = (nr_frags ? 0 : PCIE_DESC_TX_EOP) | md_bytes; txd->dma_len = cpu_to_le16(skb_headlen(skb)); nfp_desc_set_dma_addr(txd, dma_addr); txd->data_len = cpu_to_le16(skb->len); @@ -855,7 +881,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) *txd = txdg; txd->dma_len = cpu_to_le16(fsize); nfp_desc_set_dma_addr(txd, dma_addr); - txd->offset_eop = + txd->offset_eop |= (f == nr_frags - 1) ? PCIE_DESC_TX_EOP : 0; } @@ -1450,6 +1476,10 @@ nfp_net_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta, meta->mark = get_unaligned_be32(data); data += 4; break; + case NFP_NET_META_PORTID: + meta->portid = get_unaligned_be32(data); + data += 4; + break; case NFP_NET_META_CSUM: meta->csum_type = CHECKSUM_COMPLETE; meta->csum = @@ -1594,6 +1624,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) struct nfp_net_rx_buf *rxbuf; struct nfp_net_rx_desc *rxd; struct nfp_meta_parsed meta; + struct net_device *netdev; dma_addr_t new_dma_addr; void *new_frag; @@ -1672,7 +1703,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) } if (xdp_prog && !(rxd->rxd.flags &
[PATCH net-next 08/12] nfp: provide nfp_port to nfp_net_get_mac_addr()
Provide port rather than vNIC as parameter of nfp_net_get_mac_addr. This is to allow this function to be used by representor netdevs where a vNIC may have more than one physical port none of which are associated with the vNIC. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_app_nic.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_main.h | 3 ++- drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 25 +++ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c index 7b966bd3d214..c11a6c34e217 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c @@ -69,7 +69,7 @@ int nfp_app_nic_vnic_init(struct nfp_app *app, struct nfp_net *nn, if (err) return err < 0 ? err : 0; - nfp_net_get_mac_addr(app->pf, nn, id); + nfp_net_get_mac_addr(app->pf, nn->port, id); return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h index aa69d4101eb9..edc14dc78674 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h @@ -58,6 +58,7 @@ struct nfp_hwinfo; struct nfp_mip; struct nfp_net; struct nfp_nsp_identify; +struct nfp_port; struct nfp_rtsym_table; /** @@ -147,7 +148,7 @@ void nfp_hwmon_unregister(struct nfp_pf *pf); struct nfp_eth_table_port * nfp_net_find_port(struct nfp_eth_table *eth_tbl, unsigned int id); void -nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id); +nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_port *port, unsigned int id); bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index eb87e1c08bb1..e16a5fa92279 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ 
b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -135,25 +135,24 @@ static u8 __iomem *nfp_net_map_area(struct nfp_cpp *cpp, /** * nfp_net_get_mac_addr() - Get the MAC address. * @pf: NFP PF handle - * @nn: NFP Network structure + * @port: NFP port structure * @id: NFP port id * * First try to get the MAC address from NSP ETH table. If that * fails try HWInfo. As a last resort generate a random address. */ void -nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id) +nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_port *port, unsigned int id) { struct nfp_eth_table_port *eth_port; - struct nfp_net_dp *dp = >dp; u8 mac_addr[ETH_ALEN]; const char *mac_str; char name[32]; - eth_port = __nfp_port_get_eth_port(nn->port); + eth_port = __nfp_port_get_eth_port(port); if (eth_port) { - ether_addr_copy(dp->netdev->dev_addr, eth_port->mac_addr); - ether_addr_copy(dp->netdev->perm_addr, eth_port->mac_addr); + ether_addr_copy(port->netdev->dev_addr, eth_port->mac_addr); + ether_addr_copy(port->netdev->perm_addr, eth_port->mac_addr); return; } @@ -161,22 +160,22 @@ nfp_net_get_mac_addr(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id) mac_str = nfp_hwinfo_lookup(pf->hwinfo, name); if (!mac_str) { - dev_warn(dp->dev, "Can't lookup MAC address. Generate\n"); - eth_hw_addr_random(dp->netdev); + nfp_warn(pf->cpp, "Can't lookup MAC address. Generate\n"); + eth_hw_addr_random(port->netdev); return; } if (sscanf(mac_str, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx", _addr[0], _addr[1], _addr[2], _addr[3], _addr[4], _addr[5]) != 6) { - dev_warn(dp->dev, -"Can't parse MAC address (%s). Generate.\n", mac_str); - eth_hw_addr_random(dp->netdev); + nfp_warn(pf->cpp, "Can't parse MAC address (%s). 
Generate.\n", +mac_str); + eth_hw_addr_random(port->netdev); return; } - ether_addr_copy(dp->netdev->dev_addr, mac_addr); - ether_addr_copy(dp->netdev->perm_addr, mac_addr); + ether_addr_copy(port->netdev->dev_addr, mac_addr); + ether_addr_copy(port->netdev->perm_addr, mac_addr); } struct nfp_eth_table_port * -- 2.1.4
[PATCH net-next 10/12] nfp: add support for control messages for flower app
In preparation for adding a new flower app - targeted at offloading the flower classifier - provide support for control message that it will use to communicate with the NFP. Based in part on work by Bert van Leeuwen. Signed-off-by: Simon HormanReviewed-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/Makefile | 1 + drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 159 +++ drivers/net/ethernet/netronome/nfp/flower/cmsg.h | 116 + drivers/net/ethernet/netronome/nfp/nfp_app.c | 5 +- drivers/net/ethernet/netronome/nfp/nfp_app.h | 3 +- 5 files changed, 281 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.c create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.h diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index a401113035f5..e14f62863add 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -27,6 +27,7 @@ nfp-objs := \ nfp_port.o \ bpf/main.o \ bpf/offload.o \ + flower/cmsg.o \ nic/main.o ifeq ($(CONFIG_BPF_SYSCALL),y) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c new file mode 100644 index ..523ae03b49c5 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2015-2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. 
Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "../nfpcore/nfp_cpp.h" +#include "../nfp_net_repr.h" +#include "./cmsg.h" + +#define nfp_flower_cmsg_warn(app, fmt, args...) 
\ + do {\ + if (net_ratelimit())\ + nfp_warn((app)->cpp, fmt, ## args); \ + } while (0) + +static struct nfp_flower_cmsg_hdr * +nfp_flower_cmsg_get_hdr(struct sk_buff *skb) +{ + return (struct nfp_flower_cmsg_hdr *)skb->data; +} + +static void *nfp_flower_cmsg_get_data(struct sk_buff *skb) +{ + return (unsigned char *)skb->data + NFP_FLOWER_CMSG_HLEN; +} + +static struct sk_buff * +nfp_flower_cmsg_alloc(struct nfp_app *app, unsigned int size, + enum nfp_flower_cmsg_type_port type) +{ + struct nfp_flower_cmsg_hdr *ch; + struct sk_buff *skb; + + size += NFP_FLOWER_CMSG_HLEN; + + skb = nfp_app_ctrl_msg_alloc(app, size, GFP_KERNEL); + if (!skb) + return NULL; + + ch = nfp_flower_cmsg_get_hdr(skb); + ch->pad = 0; + ch->version = NFP_FLOWER_CMSG_VER1; + ch->type = type; + skb_put(skb, size); + + return skb; +} + +int nfp_flower_cmsg_portmod(struct net_device *netdev) +{ + struct nfp_repr *repr = netdev_priv(netdev); + struct nfp_flower_cmsg_portmod *msg; + struct sk_buff *skb; + + skb = nfp_flower_cmsg_alloc(repr->app, sizeof(*msg), + NFP_FLOWER_CMSG_TYPE_PORT_MOD); + if (!skb) + return -ENOMEM; + + msg = nfp_flower_cmsg_get_data(skb); + msg->portnum = repr->dst->u.port_info.port_id; + msg->reserved = 0; + msg->info = netif_carrier_ok(netdev); + msg->mtu = cpu_to_be16(netdev->mtu); + + nfp_ctrl_tx(repr->app->ctrl, skb); + + return
[PATCH net-next 03/12] nfp: move physical port init into a helper
From: Jakub KicinskiMove MAC/PHY port init into a helper to make it easier to reuse it in the representor code. Signed-off-by: Jakub Kicinski Signed-off-by: Simon Horman --- drivers/net/ethernet/netronome/nfp/nfp_app_nic.c | 23 ++ drivers/net/ethernet/netronome/nfp/nfp_port.c| 25 drivers/net/ethernet/netronome/nfp/nfp_port.h| 3 +++ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c index 83c65e6291ee..7b966bd3d214 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c @@ -42,6 +42,8 @@ static int nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, struct nfp_net *nn, unsigned int id) { + int err; + if (!pf->eth_tbl) return 0; @@ -49,26 +51,13 @@ nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, if (IS_ERR(nn->port)) return PTR_ERR(nn->port); - nn->port->eth_id = id; - nn->port->eth_port = nfp_net_find_port(pf->eth_tbl, id); - - /* Check if vNIC has external port associated and cfg is OK */ - if (!nn->port->eth_port) { - nfp_err(app->cpp, - "NSP port entries don't match vNICs (no entry for port #%d)\n", - id); + err = nfp_port_init_phy_port(pf, app, nn->port, id); + if (err) { nfp_port_free(nn->port); - return -EINVAL; - } - if (nn->port->eth_port->override_changed) { - nfp_warn(app->cpp, -"Config changed for port #%d, reboot required before port will be operational\n", -id); - nn->port->type = NFP_PORT_INVALID; - return 1; + return err; } - return 0; + return nn->port->type == NFP_PORT_INVALID; } int nfp_app_nic_vnic_init(struct nfp_app *app, struct nfp_net *nn, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c index a17410ac01ab..19bceeb82225 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c @@ -33,6 +33,7 @@ #include +#include 
"nfpcore/nfp_cpp.h" #include "nfpcore/nfp_nsp.h" #include "nfp_app.h" #include "nfp_main.h" @@ -112,6 +113,30 @@ nfp_port_get_phys_port_name(struct net_device *netdev, char *name, size_t len) return 0; } +int nfp_port_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, + struct nfp_port *port, unsigned int id) +{ + port->eth_id = id; + port->eth_port = nfp_net_find_port(pf->eth_tbl, id); + + /* Check if vNIC has external port associated and cfg is OK */ + if (!port->eth_port) { + nfp_err(app->cpp, + "NSP port entries don't match vNICs (no entry for port #%d)\n", + id); + return -EINVAL; + } + if (port->eth_port->override_changed) { + nfp_warn(app->cpp, +"Config changed for port #%d, reboot required before port will be operational\n", +id); + port->type = NFP_PORT_INVALID; + return 0; + } + + return 0; +} + struct nfp_port * nfp_port_alloc(struct nfp_app *app, enum nfp_port_type type, struct net_device *netdev) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 4d1a9b3fed41..fb28c7071987 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -104,6 +104,9 @@ nfp_port_alloc(struct nfp_app *app, enum nfp_port_type type, struct net_device *netdev); void nfp_port_free(struct nfp_port *port); +int nfp_port_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, + struct nfp_port *port, unsigned int id); + int nfp_net_refresh_eth_port(struct nfp_port *port); void nfp_net_refresh_port_table(struct nfp_port *port); int nfp_net_refresh_port_table_sync(struct nfp_pf *pf); -- 2.1.4
[PATCH net-next 01/12] net: store port/representator id in metadata_dst
From: Jakub Kicinski Switches and modern SR-IOV enabled NICs may multiplex traffic from Port representators and control messages over a single set of hardware queues. Control messages and muxed traffic may need ordered delivery. Those requirements make it hard to comfortably use TC infrastructure today unless we have a way of attaching metadata to skbs at the upper device. Because a single set of queues is used for many netdevs, stopping TC/sched queues of all of them reliably is impossible and the lower device has to retreat to returning NETDEV_TX_BUSY and usually has to take extra locks on the fastpath. This patch attempts to enable port/representative devs to attach metadata to skbs which carry port id. This way representatives can be queueless and all queuing can be performed at the lower netdev in the usual way. Traffic arriving on the port/representative interfaces will have metadata attached and will subsequently be queued to the lower device for transmission. The lower device should recognize the metadata and translate it to HW specific format which is most likely either a special header inserted before the network headers or descriptor/metadata fields. Metadata is associated with the lower device by storing the netdev pointer along with port id so that if TC decides to redirect or mirror the new netdev will not try to interpret it. This is mostly for SR-IOV devices since switches don't have lower netdevs today. 
Signed-off-by: Jakub Kicinski Signed-off-by: Sridhar Samudrala Signed-off-by: Simon Horman --- include/net/dst_metadata.h | 41 - net/core/dst.c | 15 ++- net/core/filter.c | 1 + net/ipv4/ip_tunnel_core.c | 6 -- net/openvswitch/flow_netlink.c | 4 +++- 5 files changed, 50 insertions(+), 17 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 701fc814d0af..a803129a4849 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -5,10 +5,22 @@ #include #include +enum metadata_type { + METADATA_IP_TUNNEL, + METADATA_HW_PORT_MUX, +}; + +struct hw_port_info { + struct net_device *lower_dev; + u32 port_id; +}; + struct metadata_dst { struct dst_entrydst; + enum metadata_type type; union { struct ip_tunnel_info tun_info; + struct hw_port_info port_info; } u; }; @@ -27,7 +39,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; - if (md_dst) + if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return _dst->u.tun_info; dst = skb_dst(skb); @@ -55,22 +67,33 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); - if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len) + if (!a != !b || a->type != b->type) return 1; - return memcmp(>u.tun_info, >u.tun_info, - sizeof(a->u.tun_info) + a->u.tun_info.options_len); + switch (a->type) { + case METADATA_HW_PORT_MUX: + return memcmp(>u.port_info, >u.port_info, + sizeof(a->u.port_info)); + case METADATA_IP_TUNNEL: + return memcmp(>u.tun_info, >u.tun_info, + sizeof(a->u.tun_info) + +a->u.tun_info.options_len); + default: + return 1; + } } void metadata_dst_free(struct metadata_dst *); -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); -struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); +struct metadata_dst 
*metadata_dst_alloc(u8 optslen, enum metadata_type type, + gfp_t flags); +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; - tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); + tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; @@ -85,11 +108,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) int md_size; struct metadata_dst *new_md; - if (!md_dst) + if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; - new_md = metadata_dst_alloc(md_size, GFP_ATOMIC); +
[PATCH net-next 00/12] nfp: add flower app with representors
Hi, this series adds a flower app to the NFP driver. It initialises four types of netdevs: * PF netdev - lower-device for communication of packets to device * PF representor netdev * VF representor netdevs * Phys port representor netdevs The PF netdev acts as a lower-device which sends and receives packets to and from the firmware. The representors act as upper-devices. For TX, representors attach a metadata dst to the skb which is used by the PF netdev to prepend metadata to the packet before forwarding it to the firmware. On RX the PF netdev looks up the representor based on the prepended metadata received from the firmware and forwards the skb to the representor after removing the metadata. Control queues are used to send and receive control messages which are used to communicate configuration information with the firmware. These are in a separate vNIC from the queues belonging to the PF netdev. The control queues are not exposed to user-space via a netdev or any other means. As the name implies this app is targeted at providing offload of TC flower. That will be added by follow-up work. This patchset focuses on adding phys port and VF representor netdevs to which flower classifiers may be attached. 
Jakub Kicinski (3): net: store port/representator id in metadata_dst nfp: devlink add support for getting eswitch mode nfp: move physical port init into a helper Simon Horman (9): nfp: map mac_stats and vf_cfg BARs nfp: general representor implementation nfp: add stats and xmit helpers for representors nfp: app callbacks for SRIOV nfp: provide nfp_port to of nfp_net_get_mac_addr() nfp: add support for tx/rx with metadata portid nfp: add support for control messages for flower app nfp: add flower app nfp: add VF and PF representors to flower app drivers/net/ethernet/netronome/nfp/Makefile| 3 + drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 159 + drivers/net/ethernet/netronome/nfp/flower/cmsg.h | 116 +++ drivers/net/ethernet/netronome/nfp/flower/main.c | 376 + drivers/net/ethernet/netronome/nfp/nfp_app.c | 26 +- drivers/net/ethernet/netronome/nfp/nfp_app.h | 58 +++- drivers/net/ethernet/netronome/nfp/nfp_app_nic.c | 25 +- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 18 + drivers/net/ethernet/netronome/nfp/nfp_main.c | 42 ++- drivers/net/ethernet/netronome/nfp/nfp_main.h | 11 +- drivers/net/ethernet/netronome/nfp/nfp_net.h | 1 + .../net/ethernet/netronome/nfp/nfp_net_common.c| 57 +++- drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 141 +--- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 352 +++ drivers/net/ethernet/netronome/nfp/nfp_net_repr.h | 120 +++ drivers/net/ethernet/netronome/nfp/nfp_port.c | 25 ++ drivers/net/ethernet/netronome/nfp/nfp_port.h | 63 .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h | 2 + .../ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 5 +- include/net/dst_metadata.h | 41 ++- net/core/dst.c | 15 +- net/core/filter.c | 1 + net/ipv4/ip_tunnel_core.c | 6 +- net/openvswitch/flow_netlink.c | 4 +- 24 files changed, 1574 insertions(+), 93 deletions(-) create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.c create mode 100644 drivers/net/ethernet/netronome/nfp/flower/cmsg.h create mode 100644 
drivers/net/ethernet/netronome/nfp/flower/main.c create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h -- 2.1.4
Re: [PATCH net-next 0/1] Introduction of the tc tests
On Mon, 19 Jun 2017 23:48:19 -0400 (EDT) David Miller wrote: > From: Cong Wang > Date: Mon, 19 Jun 2017 16:37:29 -0700 > > > Hi, > > > > On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates wrote: > >> Apologies for sending this as one big patch. I've been sitting on this a > >> little > >> too long, but it's ready and I wanted to get it out. > >> > >> There are a limited number of tests to start - I plan to add more on a > >> regular > >> basis. > >> > >> Lucas Bates (1): > >> selftests: Introduce tc testsuite > > > > Nice work! > > > > Is there any particular reason you want to put these tests in kernel tree > > especially tools/testing/selftests/ ? > > Yeah, it would be absolutely terrible if we had more tests in the > kernel selftests area for networking. > > More seriously, we need more, not less, tests in the kernel networking > selftests directory. > > It doesn't belong in iproute2 because we want a place to put things > that automatically get tested when someone makes kernel changes and > can be integrated into the kernel development workflow. > > I want as many tests as possible under there, so I'm really surprised > that you're asking "why" tests are being added there. The "Occam's razor" for deciding where tests belong should be: does the test need to change to respond to kernel change? Don't want to have iproute2 tests that have if (kernel_version > ...)
Re: [PATCH net-next 0/1] Introduction of the tc tests
On Mon, 19 Jun 2017 23:48:19 -0400 (EDT) David Millerwrote: > From: Cong Wang > Date: Mon, 19 Jun 2017 16:37:29 -0700 > > > Hi, > > > > On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates wrote: > >> Apologies for sending this as one big patch. I've been sitting on this a > >> little > >> too long, but it's ready and I wanted to get it out. > >> > >> There are a limited number of tests to start - I plan to add more on a > >> regular > >> basis. > >> > >> Lucas Bates (1): > >> selftests: Introduce tc testsuite > > > > Nice work! > > > > Is there any particular reason you want to put these tests in kernel tree > > especially tools/testing/selftests/ ? > > Yeah, it would be absolutely terrible if we had more tests in the > kernel selftests area for networking. > > More seriously, we need more, not less, tests in the kernel networking > selftests directory. > > It doesn't belong in iproute2 because we want a place to put things > that automatically get tested when someone makes kernel changes and > can be integrated into the kernel development workflow. > > I want as many tests as possible under there, so I'm really surprised > that you're asking "why" tests are being added there. I agree these tests should be more about kernel behavior and updated when kernel changes. Iproute2 has some outdated tests of its own, but these are more functional tests for the command portion.
Re: [net,v2] ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf
Hello, On Mon, Jun 19, 2017 at 8:15 PM, jeffywrote: > but actually they are not guaranteed to be paired: > > the netdev_run_todo(see the first dump stack above) would call > netdev_wait_allrefs to rebroadcast unregister notification multiple times, > unless timed out or all of the "struct net_device"'s refs released: > > * This is called when unregistering network devices. > * > * Any protocol or device that holds a reference should register > * for netdevice notification, and cleanup and put back the > * reference if they receive an UNREGISTER event. > * We can get stuck here if buggy protocols don't correctly > * call dev_put. > */ > static void netdev_wait_allrefs(struct net_device *dev) > { > ... > while (refcnt != 0) { > if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { > rtnl_lock(); > > /* Rebroadcast unregister notification */ > call_netdevice_notifiers(NETDEV_UNREGISTER, dev); > > __rtnl_unlock(); > rcu_barrier(); > rtnl_lock(); > > > call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); Interesting, I didn't notice this corner-case, because normally we would hit the one in rollback_registered_many(). Probably we need to add a check if (dev->reg_state == NETREG_UNREGISTERING) in ip6_route_dev_notify(). Can you give it a try? I guess we probably need to revise other NETDEV_UNREGISTER handlers too. I will send a patch tomorrow. Thanks!
Re: [PATCH net-next 0/1] Introduction of the tc tests
On Mon, Jun 19, 2017 at 8:48 PM, David Millerwrote: > From: Cong Wang > Date: Mon, 19 Jun 2017 16:37:29 -0700 > >> Hi, >> >> On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates wrote: >>> Apologies for sending this as one big patch. I've been sitting on this a >>> little >>> too long, but it's ready and I wanted to get it out. >>> >>> There are a limited number of tests to start - I plan to add more on a >>> regular >>> basis. >>> >>> Lucas Bates (1): >>> selftests: Introduce tc testsuite >> >> Nice work! >> >> Is there any particular reason you want to put these tests in kernel tree >> especially tools/testing/selftests/ ? > > Yeah, it would be absolutely terrible if we had more tests in the > kernel selftests area for networking. > > More seriously, we need more, not less, tests in the kernel networking > selftests directory. > > It doesn't belong in iproute2 because we want a place to put things > that automatically get tested when someone makes kernel changes and > can be integrated into the kernel development workflow. > > I want as many tests as possible under there, so I'm really surprised > that you're asking "why" tests are being added there. I thought tools/testing/selftests/ is mainly for those tests close to kernel ABI and API. What is the criteria for these tests? If any test can fit in, we somehow would merge the whole LTP... I definitely don't object more tests, I am just wondering if we should put it to tools/testing/selftests/ or host it somewhere else.
Re: [PATCH 0/9] Bug fixes and ctr mode of operation
On Thu, Jun 15, 2017 at 12:43:38PM +0530, Harsh Jain wrote: > This series is based on cryptodev2.6 tree and includes bug fix ,ctr(aes), > rfc3686(ctr(aes)) algo. > > Harsh Jain (7): > crypto: chcr - Pass lcb bit setting to firmware > crypto: chcr - Set fallback key > crypto: chcr - Return correct error code > crypto: chcr - Avoid changing request structure > crypto:chcr - Add ctr mode and process large sg entries for cipher > MAINTAINERS:Add maintainer for chelsio crypto driver > crypto: chcr - Ensure Destination sg entry size less than 2k > Atul Gupta (2): > chcr - Add debug counters > crypto: chcr - Select device in Round Robin fashion All applied. Thanks. -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH net-next 0/1] Introduction of the tc tests
From: Cong Wang Date: Mon, 19 Jun 2017 16:37:29 -0700 > Hi, > > On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates wrote: >> Apologies for sending this as one big patch. I've been sitting on this a >> little >> too long, but it's ready and I wanted to get it out. >> >> There are a limited number of tests to start - I plan to add more on a >> regular >> basis. >> >> Lucas Bates (1): >> selftests: Introduce tc testsuite > > Nice work! > > Is there any particular reason you want to put these tests in kernel tree > especially tools/testing/selftests/ ? Yeah, it would be absolutely terrible if we had more tests in the kernel selftests area for networking. More seriously, we need more, not less, tests in the kernel networking selftests directory. It doesn't belong in iproute2 because we want a place to put things that automatically get tested when someone makes kernel changes and can be integrated into the kernel development workflow. I want as many tests as possible under there, so I'm really surprised that you're asking "why" tests are being added there.
Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test
Hi, Andrew On 2017/6/20 5:54, Andrew Lunn wrote: > On Mon, Jun 19, 2017 at 02:00:43PM -0700, Florian Fainelli wrote: >> On 06/16/2017 02:24 AM, Lin Yun Sheng wrote: >>> This patch fixes the phy loopback self_test failed issue. when >>> Marvell Phy Module is loaded, it will powerdown fiber when doing >>> phy loopback self test, which cause phy loopback self_test fail. >>> >>> Signed-off-by: Lin Yun Sheng>>> --- >>> drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++-- >>> 1 file changed, 14 insertions(+), 2 deletions(-) >>> >>> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >>> b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >>> index b8fab14..e95795b 100644 >>> --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >>> +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >>> @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct >>> phy_device *phy_dev, u8 en) >> >> The question really is, why is not this properly integrated into the PHY >> driver and PHYLIB such that the only thing the Ethernet MAC driver has >> to call is a function of the PHY driver putting it in self-test? > > This whole driver pokes various PHY registers, rather than use > phylib. And it does so without taking the PHY lock. I will consider using phylib as much as possible, thanks. It also assumes it > is a Marvell PHY and i don't see anywhere it actually verifies this. When it said Marvell Phy , I meant Marvell Phy with fibre support. I will send anther patch to only setting bit in Fiber Control when it is a Marvell Phy with fibre support. Thanks for reply. Best Regards Yunsheng Lin > > This is all broken. > > Andrew > > . >
Re: [net,v2] ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf
Hi guys, i hit some warnings when testing this patch on my local 4.4 kernel(arm64 chromebook) with KASAN & SLUB_DEBUG: [9.919374] BUG: KASAN: use-after-free in ip6_route_dev_notify+0x194/0x2bc at addr ffc0c9d4e480 [9.928469] Read of size 4 by task kworker/u12:3/124 [9.933463] = [9.941686] BUG kmalloc-1024 (Not tainted): kasan: bad access detected ... [ 10.741337] Workqueue: netns cleanup_net [ 10.745300] Call trace: [ 10.747776] [] dump_backtrace+0x0/0x200 [ 10.753203] [] show_stack+0x24/0x30 [ 10.758284] [] dump_stack+0xa8/0xcc [ 10.763364] [] print_trailer+0x158/0x168 [ 10.768877] [] object_err+0x4c/0x5c [ 10.773956] [] kasan_report+0x338/0x4ec [ 10.779383] [] __asan_load4+0x7c/0x84 [ 10.784640] [] ip6_route_dev_notify+0x194/0x2bc [ 10.790763] [] notifier_call_chain+0x78/0xc0 [ 10.796625] [] raw_notifier_call_chain+0x3c/0x4c [ 10.802835] [] call_netdevice_notifiers_info+0x8c/0x9c [ 10.809564] [] call_netdevice_notifiers+0x9c/0xcc [ 10.815859] [] netdev_run_todo+0x224/0x3f0 [ 10.821547] [] rtnl_unlock+0x14/0x1c [ 10.826716] [] default_device_exit_batch+0x258/0x2a0 [ 10.833269] [] ops_exit_list+0x74/0xdc [ 10.838608] [] cleanup_net+0x290/0x400 and also this: [ 11.607268] BUG kmalloc-1024 (Tainted: GB ): Poison overwritten [ 11.607270] - [ 11.607274] INFO: 0xffc0c9d4e478-0xffc0c9d4e478. First byte 0x67 instead of 0x6b ... [ 11.607679] [] print_trailer+0x158/0x168 [ 11.607683] [] check_bytes_and_report+0xd8/0x13c [ 11.607688] [] check_object+0x134/0x230 [ 11.607692] [] alloc_debug_processing+0x104/0x178 [ 11.607697] [] ___slab_alloc.constprop.26+0x2ec/0x434 [ 11.607702] [] __slab_alloc.isra.23.constprop.25+0x48/0x5c [ 11.607707] [] __kmalloc_track_caller+0x12c/0x338 it looks like the "struct inet6_dev" been touched after freed, and refcnt changed(0xffc0c9d4e478, 376 bytes of struct inet6_dev) when reusing this memory. 
i think the problem would be we are assuming NETDEV_REGISTER and NETDEV_UNREGISTER be paired in ip6_route_dev_notify: > + if (event == NETDEV_REGISTER) { >net->ipv6.ip6_null_entry->dst.dev = dev; >net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); > #ifdef CONFIG_IPV6_MULTIPLE_TABLES > @@ -3718,6 +3721,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, >net->ipv6.ip6_blk_hole_entry->dst.dev = dev; >net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); > #endif > + } else if (event == NETDEV_UNREGISTER) { > + in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); > +#ifdef CONFIG_IPV6_MULTIPLE_TABLES > + in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); > + in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); > +#endif >} but actually they are not guaranteed to be paired: the netdev_run_todo(see the first dump stack above) would call netdev_wait_allrefs to rebroadcast unregister notification multiple times, unless timed out or all of the "struct net_device"'s refs released: * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static void netdev_wait_allrefs(struct net_device *dev) { ... 
while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); On 05/05/2017 01:36 AM, WANG Cong wrote: For each netns (except init_net), we initialize its null entry in 3 places: 1) The template itself, as we use kmemdup() 2) Code around dst_init_metrics() in ip6_route_net_init() 3) ip6_route_dev_notify(), which is supposed to initialize it after loopback registers Unfortunately the last one still happens in a wrong order because we expect to initialize net->ipv6.ip6_null_entry->rt6i_idev to net->loopback_dev's idev, so we have to do that after we add idev to it. However, this notifier has priority == 0 same as ipv6_dev_notf, and ipv6_dev_notf is registered after ip6_route_dev_notifier so it is called actually after ip6_route_dev_notifier. Fix it by picking a smaller priority for ip6_route_dev_notifier. Also, we have to release the refcnt accordingly when unregistering loopback_dev because device exit functions are called before subsys exit functions. Cc: David
Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test
hi, Florian On 2017/6/20 5:00, Florian Fainelli wrote: > On 06/16/2017 02:24 AM, Lin Yun Sheng wrote: >> This patch fixes the phy loopback self_test failed issue. when >> Marvell Phy Module is loaded, it will powerdown fiber when doing >> phy loopback self test, which cause phy loopback self_test fail. >> >> Signed-off-by: Lin Yun Sheng>> --- >> drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++-- >> 1 file changed, 14 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >> b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >> index b8fab14..e95795b 100644 >> --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >> +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c >> @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct >> phy_device *phy_dev, u8 en) > > The question really is, why is not this properly integrated into the PHY > driver and PHYLIB such that the only thing the Ethernet MAC driver has > to call is a function of the PHY driver putting it in self-test? Do you meaning calling phy_dev->drv->resume and phy_dev->drv->suspend function? I tried it, but it failed. if that is what you mean, I will look into it why it fail. Thanks for your reply. 
Best regards YunSheng Lin > >> >> /* Force 1000M Link, Default is 0x0200 */ >> phy_write(phy_dev, 7, 0x20C); >> -phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); >> >> -/* Enable PHY loop-back */ >> +/* Powerup Fiber */ >> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 1); >> +val = phy_read(phy_dev, COPPER_CONTROL_REG); >> +val &= ~PHY_POWER_DOWN; >> +phy_write(phy_dev, COPPER_CONTROL_REG, val); >> + >> +/* Enable Phy Loopback */ >> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); >> val = phy_read(phy_dev, COPPER_CONTROL_REG); >> val |= PHY_LOOP_BACK; >> val &= ~PHY_POWER_DOWN; >> @@ -299,6 +305,12 @@ static int hns_nic_config_phy_loopback(struct >> phy_device *phy_dev, u8 en) >> phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA); >> phy_write(phy_dev, 1, 0x400); >> phy_write(phy_dev, 7, 0x200); >> + >> +phy_write(phy_dev, HNS_PHY_PAGE_REG, 1); >> +val = phy_read(phy_dev, COPPER_CONTROL_REG); >> +val |= PHY_POWER_DOWN; >> +phy_write(phy_dev, COPPER_CONTROL_REG, val); >> + >> phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); >> phy_write(phy_dev, 9, 0xF00); >> >> > >
[PATCH net-next v3 07/15] bpf: Add setsockopt helper function to bpf
Added support for calling a subset of socket setsockopts from BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather than making the changes to call the socket setsockopt function because the changes required would have been larger. The ops supported are: SO_RCVBUF SO_SNDBUF SO_MAX_PACING_RATE SO_PRIORITY SO_RCVLOWAT SO_MARK Signed-off-by: Lawrence Brakmo--- include/uapi/linux/bpf.h | 14 - net/core/filter.c | 77 ++- samples/bpf/bpf_helpers.h | 3 ++ 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 314fdf3..86595f9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -520,6 +520,17 @@ union bpf_attr { * Set full skb->hash. * @skb: pointer to skb * @hash: hash to set + * + * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) + * Calls setsockopt. Not all opts are available, only those with + * integer optvals plus TCP_CONGESTION. + * Supported levels: SOL_SOCKET and IPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: SOL_SOCKET or IPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in byes + * Return: 0 or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -570,7 +581,8 @@ union bpf_attr { FN(probe_read_str), \ FN(get_socket_cookie), \ FN(get_socket_uid), \ - FN(set_hash), + FN(set_hash), \ + FN(setsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 7d69d16..b114ae1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -54,6 +54,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char 
*, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + int val; + + if (bpf_sock->is_req_sock) + return -EINVAL; + + if (level == SOL_SOCKET) { + /* Only some socketops are supported */ + val = *((int *)optval); + + switch (optname) { + case SO_RCVBUF: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + break; + case SO_SNDBUF: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + break; + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, +sk->sk_max_pacing_rate); + break; + case SO_PRIORITY: + sk->sk_priority = val; + break; + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + case SO_MARK: + sk->sk_mark = val; + break; + default: + ret = -EINVAL; + } + } else if (level == SOL_TCP && + sk->sk_prot->setsockopt == tcp_setsockopt) { + /* Place holder */ + ret = -EINVAL; + } else { + ret = -EINVAL; + } + return ret; +} + +static const struct bpf_func_proto bpf_setsockopt_proto = { + .func = bpf_setsockopt, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2822,6 +2886,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * + sock_ops_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return _setsockopt_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto *
[PATCH net-next v3 11/15] bpf: Sample BPF program to set congestion control
Sample BPF program that sets congestion control to dctcp when both hosts are within the same datacenter. In this example that is assumed to be when they have the first 5.5 bytes of their IPv6 address are the same. Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile| 1 + samples/bpf/tcp_cong_kern.c | 73 + 2 files changed, 74 insertions(+) create mode 100644 samples/bpf/tcp_cong_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 942c7c7..eb324e0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -116,6 +116,7 @@ always += cookie_uid_helper_example.o always += tcp_synrto_kern.o always += tcp_rwnd_kern.o always += tcp_bufs_kern.o +always += tcp_cong_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c new file mode 100644 index 000..d56fb8a --- /dev/null +++ b/samples/bpf/tcp_cong_kern.c @@ -0,0 +1,73 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set congestion control to dctcp when both hosts are + * in the same datacenter (as deteremined by IPv6 prefix). + */ + +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_cong(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + char cong[] = "dctcp"; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Check if both hosts are in the same datacenter. 
For this +* example they are if the 1st 5.5 bytes in the IPv6 address +* are the same. +*/ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (skops->local_ip6[1] & 0xfff0) == + (skops->remote_ip6[1] & 0xfff0)) { + switch (op) { + case BPF_SOCK_OPS_NEEDS_ECN: + rv = 1; + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
[PATCH net-next v3 15/15] bpf: Sample bpf program to set sndcwnd clamp
Sample BPF program, tcp_clamp_kern.c, to demostrate the use of setting the sndcwnd clamp. This program assumes that if the first 5.5 bytes of the host's IPv6 addresses are the same, then the hosts are in the same datacenter and sets sndcwnd clamp to 100 packets, SYN and SYN-ACK RTOs to 10ms and send/receive buffer sizes to 150KB. Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile | 1 + samples/bpf/tcp_clamp_kern.c | 93 2 files changed, 94 insertions(+) create mode 100644 samples/bpf/tcp_clamp_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3ec96a0..59975c3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -118,6 +118,7 @@ always += tcp_rwnd_kern.o always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o +always += tcp_clamp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c new file mode 100644 index 000..413eeba --- /dev/null +++ b/samples/bpf/tcp_clamp_kern.c @@ -0,0 +1,93 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp + * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within + * the same datacenter. For his example, we assume they are within the same + * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. 
+ */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_clamp(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + int bufsize = 15; + int to_init = 10; + int clamp = 100; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Check that both hosts are within same datacenter. For this example +* it is the case when the first 5.5 bytes of their IPv6 addresses are +* the same. +*/ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (skops->local_ip6[1] & 0xfff0) == + (skops->remote_ip6[1] & 0xfff0)) { + switch (op) { + case BPF_SOCK_OPS_TIMEOUT_INIT: + rv = to_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, + , sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, , + sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + , sizeof(clamp)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + , sizeof(clamp)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, , + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, , + sizeof(bufsize)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
[PATCH net-next v3 10/15] bpf: Add support for changing congestion control
Added support for changing congestion control for SOCK_OPS bpf programs through the setsockopt bpf helper function. It also adds a new SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, that is needed for congestion controls, like dctcp, that need to enable ECN in the SYN packets. Signed-off-by: Lawrence Brakmo--- include/net/tcp.h| 9 - include/uapi/linux/bpf.h | 3 +++ net/core/filter.c| 11 +-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 32 ++-- net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_output.c| 8 +--- 7 files changed, 50 insertions(+), 18 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index ff806d7..58d67be 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1003,7 +1003,9 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); +void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); @@ -2072,4 +2074,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock) rwnd = 0; return rwnd; } + +static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) +{ + return (tcp_call_bpf(sk, true, BPF_SOCK_OPS_NEEDS_ECN) == 1); +} #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4856d16..c222059 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -776,6 +776,9 @@ enum { * passive connection is * established */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control +* needs ECN +*/ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index b114ae1..bbf8f78 100644 --- 
a/net/core/filter.c +++ b/net/core/filter.c @@ -2716,8 +2716,15 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, } } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { - /* Place holder */ - ret = -EINVAL; + if (optname == TCP_CONGESTION) { + ret = tcp_set_congestion_control(sk, optval, false); + if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + /* replacing an existing ca */ + tcp_reinit_congestion_control(sk, + inet_csk(sk)->icsk_ca_ops); + } else { + ret = -EINVAL; + } } else { ret = -EINVAL; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 058f509..9476fd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2479,7 +2479,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name); + err = tcp_set_congestion_control(sk, name, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 324c9bc..fde983f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -static void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -333,8 +333,12 @@ int tcp_set_allowed_congestion_control(char *val) return ret; } -/* Change congestion control for socket */ -int tcp_set_congestion_control(struct sock *sk, const char *name) +/* Change congestion control for socket. If load is false, then it is the + * responsibility of the caller to call tcp_init_congestion_control or + * tcp_reinit_congestion_control (if the current congestion control was + * already initialized. 
+ */ +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -344,21 +348,29 @@ int
[PATCH net-next v3 13/15] bpf: Sample BPF program to set initial cwnd
Sample BPF program that assumes hosts are far away (i.e. large RTTs) and sets initial cwnd and initial receive window to 40 packets, send and receive buffers to 1.5MB. In practice there would be a test to insure the hosts are actually far enough away. Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile | 1 + samples/bpf/tcp_iw_kern.c | 78 +++ 2 files changed, 79 insertions(+) create mode 100644 samples/bpf/tcp_iw_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index eb324e0..3ec96a0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -117,6 +117,7 @@ always += tcp_synrto_kern.o always += tcp_rwnd_kern.o always += tcp_bufs_kern.o always += tcp_cong_kern.o +always += tcp_iw_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c new file mode 100644 index 000..4f978fc --- /dev/null +++ b/samples/bpf/tcp_iw_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial congestion window and initial receive + * window to 40 packets and send and receive buffers to 1.5MB. This + * would usually be done after doing appropriate checks that indicate + * the hosts are far enough away (i.e. large RTT). 
+ */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_iw(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + int bufsize = 150; + int rwnd_init = 40; + int iw = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Usually there would be a check to insure the hosts are far +* from each other so it makes sense to increase buffer sizes +*/ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, , + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, +, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, , + sizeof(iw)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, , + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, +, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
[PATCH net-next v3 14/15] bpf: Adds support for setting sndcwnd clamp
Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_SNDCWND_CLAMP, which sets the send congestion window clamp (snd_cwnd_clamp). It is useful to limit the sndcwnd when the hosts are close to each other (small RTT). Signed-off-by: Lawrence Brakmo--- include/uapi/linux/bpf.h | 1 + net/core/filter.c| 7 +++ 2 files changed, 8 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a07acc6..47189e5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -782,5 +782,6 @@ enum { }; #define TCP_BPF_IW 1001/* Set TCP initial congestion window */ +#define TCP_BPF_SNDCWND_CLAMP 1002/* Set sndcwnd_clamp */ #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index db6d30c0..664bb9f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2733,6 +2733,13 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, else tp->snd_cwnd = val; break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) { + ret = -EINVAL; + } else { + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + } default: ret = -EINVAL; } -- 2.9.3
[PATCH net-next v3 01/15] bpf: BPF support for sock_ops
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). Currently there is functionality to load one global BPF program of this type which can be called at appropriate times to set relevant connection parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Although there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it does not require application changes and it can be updated easily at any time. I plan to add support for loading per cgroup sock_ops BPF programs in the near future. One question is whether I should add this functionality into David Ahern's BPF_PROG_TYPE_CGROUP_SOCK or create a new cgroup bpf type. Whereas the current cgroup_sock type expects to be called only once during a connection's lifetime, the new sock_ops type could be called multiple times. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has an "op" field to specify the type of operation requested. 
The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use Facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type; following patches add the functionality to set various connection parameters. This patch defines a new BPF program type: BPF_PROG_TYPE_SOCK_OPS and a new bpf syscall command to load a new program of this type: BPF_PROG_LOAD_SOCKET_OPS. Two new corresponding structs (one for the kernel, one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; bool is_req_sock:1; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; __u32 local_ip4; __u32 remote_ip6[4]; __u32 local_ip6[4]; __u32 remote_port; __u32 local_port; }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function, and they ignore the return value. The reply fields of the bpf_sock_ops struct are there in case a bpf program needs to return a value larger than an integer. 
Signed-off-by: Lawrence Brakmo--- include/linux/bpf.h | 6 ++ include/linux/bpf_types.h | 1 + include/linux/filter.h| 10 +++ include/net/tcp.h | 30 include/uapi/linux/bpf.h | 28 kernel/bpf/syscall.c | 62 + net/core/Makefile | 3 +- net/core/filter.c | 170 ++ net/core/sock_bpfops.c| 65 ++ samples/bpf/bpf_load.c| 13 +++- 10 files changed, 370 insertions(+), 18 deletions(-) create mode 100644 net/core/sock_bpfops.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1bcbf0a..a1a1f2f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -362,4 +362,10 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +/* sock_ops related */ +struct bpf_sock_ops_kern; + +int bpf_sock_ops_attach_global_prog(int fd); +int bpf_sock_ops_detach_global_prog(void); +int bpf_sock_ops_call(struct bpf_sock_ops_kern *bpf_sock); #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 03bf223..3d137c3 100644 --- a/include/linux/bpf_types.h +++
[PATCH net-next v3 08/15] bpf: Add TCP connection BPF callbacks
Added callbacks to BPF SOCK_OPS type program before an active connection is intialized and after a passive or active connection is established. The following patch demostrates how they can be used to set send and receive buffer sizes. Signed-off-by: Lawrence Brakmo--- include/uapi/linux/bpf.h | 11 +++ net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_output.c| 1 + 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 86595f9..4856d16 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -765,6 +765,17 @@ enum { * window (in packets) or -1 if default * value should be used */ + BPF_SOCK_OPS_TCP_CONNECT_CB,/* Calls BPF program right before an +* active connection is initialized +*/ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an +* active connection is +* established +*/ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,/* Calls BPF program when a +* passive connection is +* established +*/ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4af82b9..ed6b549 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -221,6 +221,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); + tcp_call_bpf(child, false, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0867b05..1b868ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - + tcp_call_bpf(sk, false, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on first data @@ -5977,6 +5977,8 @@ int 
tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); + tcp_call_bpf(sk, false, +BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e5f623f..958edc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3445,6 +3445,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; + tcp_call_bpf(sk, false, BPF_SOCK_OPS_TCP_CONNECT_CB); tcp_connect_init(sk); if (unlikely(tp->repair)) { -- 2.9.3
[PATCH net-next v3 12/15] bpf: Adds support for setting initial cwnd
Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_IW, which sets the initial congestion window. This can be used when the hosts are far apart (large RTTs) and it is safe to start with a large initial cwnd. Signed-off-by: Lawrence Brakmo--- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c| 14 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c222059..a07acc6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -781,4 +781,6 @@ enum { */ }; +#define TCP_BPF_IW 1001/* Set TCP initial congestion window */ + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index bbf8f78..db6d30c0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2723,7 +2723,19 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, tcp_reinit_congestion_control(sk, inet_csk(sk)->icsk_ca_ops); } else { - ret = -EINVAL; + struct tcp_sock *tp = tcp_sk(sk); + + val = *((int *)optval); + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > 0) + ret = -EINVAL; + else + tp->snd_cwnd = val; + break; + default: + ret = -EINVAL; + } } } else { ret = -EINVAL; -- 2.9.3
[PATCH net-next v3 03/15] bpf: Support for per connection SYN/SYN-ACK RTOs
This patch adds support for setting a per connection SYN and SYN_ACK RTOs from within a BPF_SOCK_OPS program. For example, to set small RTOs when it is known both hosts are within a datacenter. Signed-off-by: Lawrence Brakmo--- include/net/tcp.h| 11 +++ include/uapi/linux/bpf.h | 3 +++ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_output.c| 2 +- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f6f415c..bdf6bfd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2051,4 +2051,15 @@ static inline int tcp_call_bpf(struct sock *sk, bool is_req_sock, int op) } #endif +static inline u32 tcp_timeout_init(struct sock *sk, bool is_req_sock) +{ + int timeout; + + timeout = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_TIMEOUT_INIT); + + if (timeout <= 0) + timeout = TCP_TIMEOUT_INIT; + return timeout; +} + #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 861dbe9..4532c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -746,6 +746,9 @@ struct bpf_sock_ops { */ enum { BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or +* -1 if default value should be used +*/ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ab7e2f..0867b05 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6406,7 +6406,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } else { tcp_rsk(req)->tfo_listener = false; if (!want_cookie) - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, + tcp_timeout_init((struct sock *)req, true)); af_ops->send_synack(sk, dst, , req, , !want_cookie ? 
TCP_SYNACK_NORMAL : TCP_SYNACK_COOKIE); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a9c395..5e478a1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3327,7 +3327,7 @@ static void tcp_connect_init(struct sock *sk) tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = tcp_timeout_init(sk, false); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } -- 2.9.3
[PATCH net-next v3 05/15] bpf: Support for setting initial receive window
This patch adds suppport for setting the initial advertized window from within a BPF_SOCK_OPS program. This can be used to support larger initial cwnd values in environments where it is known to be safe. Signed-off-by: Lawrence Brakmo--- include/net/tcp.h| 10 ++ include/uapi/linux/bpf.h | 4 net/ipv4/tcp_minisocks.c | 9 - net/ipv4/tcp_output.c| 7 ++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index bdf6bfd..ff806d7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2062,4 +2062,14 @@ static inline u32 tcp_timeout_init(struct sock *sk, bool is_req_sock) return timeout; } +static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock) +{ + int rwnd; + + rwnd = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_RWND_INIT); + + if (rwnd < 0) + rwnd = 0; + return rwnd; +} #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4532c31..314fdf3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -749,6 +749,10 @@ enum { BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or * -1 if default value should be used */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized +* window (in packets) or -1 if default +* value should be used +*/ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d30ee31..bbaf3c6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; + u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); @@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req, (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; + rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req, true); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, 
RTAX_INITRWND); + else if (full_space < rcv_wnd * mss) + full_space = rcv_wnd * mss; + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, >rsk_window_clamp, ireq->wscale_ok, _wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e478a1..e5f623f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3267,6 +3267,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -3300,13 +3301,17 @@ static void tcp_connect_init(struct sock *sk) (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) tp->window_clamp = tcp_full_space(sk); + rcv_wnd = tcp_rwnd_init_bpf(sk, false); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), >rcv_wnd, >window_clamp, sock_net(sk)->ipv4.sysctl_tcp_window_scaling, _wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; -- 2.9.3
PATCH net-next v3 00/15
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.) and to set connection parameters such as buffer sizes, initial window, SYN/SYN-ACK RTOs, etc. Unlike current BPF program types that expect to be called at a particular place in the network stack code, SOCK_OPS programs can be called at different places and use an "op" field to indicate the context. There are currently two types of operations, those whose effect is through their return value and those whose effect is through the new bpf_setsockopt BPF helper function. Example operands of the first type are: BPF_SOCK_OPS_TIMEOUT_INIT BPF_SOCK_OPS_RWND_INIT BPF_SOCK_OPS_NEEDS_ECN Example operands of the second type are: BPF_SOCK_OPS_TCP_CONNECT_CB BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB Current operands are only called during connection establishment so there should not be any BPF overheads after connection establishment. The main idea is to use connection information from both hosts, such as IP addresses and ports, to allow setting of per connection parameters to optimize the connection's performance. Although there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. 
Finally, unlike setsockopt, it does not require application changes and it can be updated easily at any time. Currently there is functionality to load one global BPF program of this type but I plan to add support for loading per cgroup socket ops BPF programs in the near future. When that is done, the global program could be called when a cgroup has no program associated with it. One question is whether I should add this functionality into David Ahern's BPF_PROG_TYPE_CGROUP_SOCK or create a new cgroup bpf type. Whereas the current cgroup_sock type expects to be called only once during a connection's lifetime, the new socket_ops type could be called multiple times. My preference is to define a new bpf attach type, BPF_CGROUP_SOCK_OPS, to attach BPF_PROG_TYPE_SOCK_OPS to cgroups. This patch set also includes sample BPF programs to demonstrate the different features. v2: Formatting changes, rebased to latest net-next v3: Fixed build issues, changed socket_ops to sock_ops throughout, fixed formatting issues, removed the syscall to load sock_ops program and added functionality to use existing bpf attach and bpf detach system calls, removed reader/writer locks in sock_bpfops.c (used when saving sock_ops global program) Consists of the following patches: include/linux/bpf.h | 6 ++ include/linux/bpf_types.h | 1 + include/linux/filter.h| 10 ++ include/net/tcp.h | 60 ++- include/uapi/linux/bpf.h | 66 +++- kernel/bpf/syscall.c | 62 +--- net/core/Makefile | 3 +- net/core/filter.c | 271 ++ net/core/sock_bpfops.c| 65 net/ipv4/tcp.c| 2 +- net/ipv4/tcp_cong.c | 32 -- net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_minisocks.c | 9 +- net/ipv4/tcp_output.c | 18 +++- samples/bpf/Makefile | 9 ++ samples/bpf/bpf_helpers.h | 3 + samples/bpf/bpf_load.c| 13 ++- samples/bpf/tcp_bpf.c | 86 samples/bpf/tcp_bufs_kern.c | 76 ++ samples/bpf/tcp_clamp_kern.c | 93 + samples/bpf/tcp_cong_kern.c | 73 ++ samples/bpf/tcp_iw_kern.c | 78 +++ samples/bpf/tcp_rwnd_kern.c | 60 +++ 
samples/bpf/tcp_synrto_kern.c | 59 +++ 25 files changed, 1126 insertions(+), 40 deletions(-)
[PATCH net-next v3 06/15] bpf: Sample bpf program to set initial window
The sample bpf program, tcp_rwnd_kern.c, sets the initial advertized window to 40 packets in an environment where distinct IPv6 prefixes indicate that both hosts are not in the same data center. Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile| 1 + samples/bpf/tcp_rwnd_kern.c | 60 + 2 files changed, 61 insertions(+) create mode 100644 samples/bpf/tcp_rwnd_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 21cb016..9aca209 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -114,6 +114,7 @@ always += xdp_tx_iptunnel_kern.o always += test_map_in_map_kern.o always += cookie_uid_helper_example.o always += tcp_synrto_kern.o +always += tcp_rwnd_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c new file mode 100644 index 000..26c1370 --- /dev/null +++ b/samples/bpf/tcp_rwnd_kern.c @@ -0,0 +1,60 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets when using IPv6 + * and the first 5.5 bytes of the IPv6 addresses are not the same (in this + * example that means both hosts are not the same datacenter. 
+ */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_rwnd(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Check for RWND_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_RWND_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are not the same +* then both hosts are not in the same datacenter +* so use a larger initial advertized window (40 packets) +*/ + if (skops->local_ip6[0] != skops->remote_ip6[0] || + (skops->local_ip6[1] & 0xf000) != + (skops->remote_ip6[1] & 0xf000)) + bpf_trace_printk(fmt2, sizeof(fmt2), -1); + rv = 40; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
[PATCH net-next v3 09/15] bpf: Sample BPF program to set buffer sizes
This patch contains a BPF program to set initial receive window to 40 packets and send and receive buffers to 1.5MB. This would usually be done after doing appropriate checks that indicate the hosts are far enough away (i.e. large RTT). Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile| 1 + samples/bpf/tcp_bufs_kern.c | 76 + 2 files changed, 77 insertions(+) create mode 100644 samples/bpf/tcp_bufs_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9aca209..942c7c7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -115,6 +115,7 @@ always += test_map_in_map_kern.o always += cookie_uid_helper_example.o always += tcp_synrto_kern.o always += tcp_rwnd_kern.o +always += tcp_bufs_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c new file mode 100644 index 000..6cc096c --- /dev/null +++ b/samples/bpf/tcp_bufs_kern.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets and send + * and receive buffers to 1.5MB. This would usually be done after + * doing appropriate checks that indicate the hosts are far enough + * away (i.e. large RTT). 
+ */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_bufs(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + int bufsize = 150; + int rwnd_init = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Usually there would be a check to insure the hosts are far +* from each other so it makes sense to increase buffer sizes +*/ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, , + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, +, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* Nothing to do */ + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, , + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, +, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
[PATCH net-next v3 02/15] bpf: program to load sock_ops BPF programs
The program tcp_bpf can be used to remove current global sock_ops program and to load/replace sock_ops BPF programs. There is also an option to print the bpf trace buffer (for debugging purposes). USAGE: ./tcp_bpf [-r] [-l] [] WHERE: -r remove current loaded sock_ops BPF program not needed if loading a new program -l print BPF trace buffer. Used when loading a new program name of BPF sock_ops program to load if does not end in ".o", then "_kern.o" is appended example: using tcp_rto will load tcp_rto_kern.o Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile | 3 ++ samples/bpf/tcp_bpf.c | 86 +++ 2 files changed, 89 insertions(+) create mode 100644 samples/bpf/tcp_bpf.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a0561dc..ed6bc75 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -36,6 +36,7 @@ hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel hostprogs-y += test_map_in_map hostprogs-y += per_socket_stats_example +hostprogs-y += tcp_bpf # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -52,6 +53,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +tcp_bpf-objs := bpf_load.o $(LIBBPF) tcp_bpf.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o @@ -130,6 +132,7 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf +HOSTLOADLIBES_tcp_bpf += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf HOSTLOADLIBES_trace_output += -lelf -lrt HOSTLOADLIBES_lathist += -lelf diff --git a/samples/bpf/tcp_bpf.c b/samples/bpf/tcp_bpf.c new file mode 100644 index 000..735b8b2 --- /dev/null +++ b/samples/bpf/tcp_bpf.c @@ -0,0 +1,86 @@ +/* Copyright 
(c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" +#include +#include +#include + +static void usage(char *pname) +{ + printf("USAGE:\n %s [-r] [-l] \n", pname); + printf("WHERE:\n"); + printf(" -r remove current loaded socketops BPF program\n"); + printf(" not needed if loading a new program\n"); + printf(" -l print out BPF log buffer\n"); + printf(" name of BPF sockeops program to load\n"); + printf(" if does not end in \".o\", then \"_kern.o\" " + "is appended\n"); + printf(" example: using tcp1 will load tcp1_kern.o\n"); + printf("\n"); +} + +int main(int argc, char **argv) +{ + //union bpf_attr attr; + int k, logFlag = 0; + int error = 0; + char fn[500]; + + if (argc <= 1) + usage(argv[0]); + for (k = 1; k < argc; k++) { + if (!strcmp(argv[k], "-r")) { + error = bpf_prog_detach(0, BPF_GLOBAL_SOCK_OPS); + if (error) { + printf("ERROR: bpf_prog_detach: %d (%s)\n", + error, strerror(errno)); + error = 1; + } + } else if (!strcmp(argv[k], "-l")) { + logFlag = 1; + } else if (!strcmp(argv[k], "-h")) { + usage(argv[0]); + } else if (argv[k][0] == '-') { + printf("Error, unknown flag: %s\n", argv[k]); + error = 2; + } else if (strlen(argv[k]) > 450) { + printf("Error, program name too long %d\n", + (int) strlen(argv[k])); + error = 3; + } else { + if (!strcmp(argv[k]+strlen(argv[k])-2, ".o")) + strcpy(fn, argv[k]); + else + sprintf(fn, "%s_kern.o", argv[k]); + if (logFlag) + printf("loading bpf file:%s\n", fn); + if (load_bpf_file(fn)) { + printf("%s", bpf_log_buf); + return 1; + } + if (logFlag) { + printf("TCP BPF Loaded %s\n", fn); + printf("%s\n", bpf_log_buf); + } + error = bpf_prog_attach(prog_fd[0], 0, +
[PATCH net-next v3 04/15] bpf: Sample bpf program to set SYN/SYN-ACK RTOs
The sample BPF program, tcp_synrto_kern.c, sets the SYN and SYN-ACK RTOs to 10ms when both hosts are within the same datacenter (i.e. small RTTs) in an environment where common IPv6 prefixes indicate both hosts are in the same data center. Signed-off-by: Lawrence Brakmo--- samples/bpf/Makefile | 1 + samples/bpf/tcp_synrto_kern.c | 59 +++ 2 files changed, 60 insertions(+) create mode 100644 samples/bpf/tcp_synrto_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ed6bc75..21cb016 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -113,6 +113,7 @@ always += lwt_len_hist_kern.o always += xdp_tx_iptunnel_kern.o always += test_map_in_map_kern.o always += cookie_uid_helper_example.o +always += tcp_synrto_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c new file mode 100644 index 000..b11efd8 --- /dev/null +++ b/samples/bpf/tcp_synrto_kern.c @@ -0,0 +1,59 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses + * and the first 5.5 bytes of the IPv6 addresses are the same (in this example + * that means both hosts are in the same datacenter. 
+ */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define DEBUG 1 + +SEC("sockops") +int bpf_synrto(struct bpf_sock_ops *skops) +{ + char fmt1[] = "BPF command: %d\n"; + char fmt2[] = " Returning %d\n"; + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program +* if neither port numberis 55601 +*/ + if (skops->remote_port != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_trace_printk(fmt1, sizeof(fmt1), op); +#endif + + /* Check for TIMEOUT_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_TIMEOUT_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are the same +* then both hosts are in the same datacenter +* so use an RTO of 10ms +*/ + if (skops->local_ip6[0] == skops->remote_ip6[0] && + (skops->local_ip6[1] & 0xfff0) == + (skops->remote_ip6[1] & 0xfff0)) + rv = 10; + } +#ifdef DEBUG + bpf_trace_printk(fmt2, sizeof(fmt2), rv); +#endif + return rv; +} +char _license[] SEC("license") = "GPL"; -- 2.9.3
Re: [PATCH v2] arm: eBPF JIT compiler
Hi Daniel, > > Sorry, had a travel over the weekend, so didn't read it in time. > > What is the issue with imitating in JIT what the interpreter is > doing as a starting point? That should be generic enough to handle > any case. > > Otherwise you'd need some sort of reverse mapping since verifier > already converted BPF_CALL insns into relative helper addresses > in imm part. > Sorry but I don't get what you are trying to say. Can you explain it with an example? -Shubham
Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control
On 6/19/17, 3:34 PM, "Daniel Borkmann"wrote: On 06/18/2017 04:39 AM, Lawrence Brakmo wrote: > On 6/16/17, 6:58 AM, "Daniel Borkmann" wrote: [...] > > /* Change congestion control for socket */ > > -int tcp_set_congestion_control(struct sock *sk, const char *name) > > +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) > > { > >struct inet_connection_sock *icsk = inet_csk(sk); > >const struct tcp_congestion_ops *ca; > > @@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) > >return -EPERM; > > > >rcu_read_lock(); > > - ca = __tcp_ca_find_autoload(name); > > + if (!load) > > + ca = tcp_ca_find(name); > > + else > > + ca = __tcp_ca_find_autoload(name); > > From BPF program side, we call with !load since we're not allowed > to sleep under RCU, that's correct ... > > >/* No change asking for existing value */ > >if (ca == icsk->icsk_ca_ops) { > >icsk->icsk_ca_setsockopt = 1; > > @@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) > >} > >if (!ca) > >err = -ENOENT; > > + else if (!load) > > + icsk->icsk_ca_ops = ca; > > ... but don't we also need to hold a module ref in this case as done > below? > > Meaning, tcp_ca_find() could return a ca that was previously loaded > to the tcp_cong_list as module, then resulting in ref count imbalance > when set from BPF? > > As I mentioned above, this can be called before congestion has been > initialized (op <= BPF_SOCKET_OPS_NEEDS_ECN) in which case > tcp_init_congestion_control will be called later. If op > ..OPS_NEEDS_ECN > then bpf_setsockopt() will call the reinit_congestion_control(). > > But this points to an issue where someone else could call > tcp_set_congestion_control() with load == false not knowing they > need to call either init or reinit. I will add a comment to the function > to make it clear. Hm, I'm not sure it answers my question. 
What I meant was that from BPF prog, you're setting tcp_set_congestion_control(..., false) so if tcp_ca_find() returns a ca that was loaded earlier from a module (so it becomes available in tcp_cong_list), the above... [...] else if (!load) icsk->icsk_ca_ops = ca; [...] ... will basically prevent the later try_module_get() on the ca. So any later tcp_reinit_congestion_control() or tcp_init_congestion_control() will still run not having the refcount held on the owner module. Meaning a module unload would let the machine crash due to the refcnt imbalance? What am I missing? Nothing, you are correct. I was mistakenly thinking that the refcount update was being done in tcp_init_congestion_control. Done.
Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE
On 06/20/2017 02:01 AM, Jakub Kicinski wrote: On Tue, 20 Jun 2017 01:50:17 +0200, Daniel Borkmann wrote: On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Respect the XDP_FLAGS_HW_MODE. When it's set install the program on the NIC and skip enabling XDP in the driver. Signed-off-by: Jakub Kicinski--- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 68648e312129..c5903b6e58c5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3310,19 +3310,22 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, struct netlink_ext_ack *extack) { - struct bpf_prog *offload_prog; + struct bpf_prog *drv_prog, *offload_prog; int err; if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES) return -EBUSY; + drv_prog = flags & XDP_FLAGS_HW_MODE ? NULL : prog; offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog; Can you make this assumption here? If dev_change_xdp_fd() is called without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog to both, drv_prog and offload_prog. Is this expected? Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES) and then set flags |= XDP_FLAGS_DRV_MODE before both assignments? I thought we did want both. In case the program is loaded to both the HW/FW will mark the packets with BPF bit in the descriptor so that they are not processed twice. But the driver path will be configured for running bpf and when user replaces the program with one which cannot be offloaded the driver will not have to reconfigure itself. Okay, that's a good point ... so that you can just use xchg() later on. Probably worth explaining this rationale in a short comment.
Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE
On Tue, 20 Jun 2017 01:50:17 +0200, Daniel Borkmann wrote: > On 06/17/2017 01:57 AM, Jakub Kicinski wrote: > > Respect the XDP_FLAGS_HW_MODE. When it's set install the program > > on the NIC and skip enabling XDP in the driver. > > > > Signed-off-by: Jakub Kicinski> > --- > > drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++--- > > 1 file changed, 7 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c > > b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c > > index 68648e312129..c5903b6e58c5 100644 > > --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c > > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c > > @@ -3310,19 +3310,22 @@ static int > > nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, > > struct netlink_ext_ack *extack) > > { > > - struct bpf_prog *offload_prog; > > + struct bpf_prog *drv_prog, *offload_prog; > > int err; > > > > if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES) > > return -EBUSY; > > > > + drv_prog = flags & XDP_FLAGS_HW_MODE ? NULL : prog; > > offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog; > > Can you make this assumption here? If dev_change_xdp_fd() is called > without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog > to both, drv_prog and offload_prog. Is this expected? > > Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES) > and then set flags |= XDP_FLAGS_DRV_MODE before both assignments? I thought we did want both. In case the program is loaded to both the HW/FW will mark the packets with BPF bit in the descriptor so that they are not processed twice. But the driver path will be configured for running bpf and when user replaces the program with one which cannot be offloaded the driver will not have to reconfigure itself.
Re: [RFC net-next 6/8] nfp: bpf: add support for XDP_FLAGS_HW_MODE
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Respect the XDP_FLAGS_HW_MODE. When it's set install the program on the NIC and skip enabling XDP in the driver. Signed-off-by: Jakub Kicinski--- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 68648e312129..c5903b6e58c5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3310,19 +3310,22 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, struct netlink_ext_ack *extack) { - struct bpf_prog *offload_prog; + struct bpf_prog *drv_prog, *offload_prog; int err; if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES) return -EBUSY; + drv_prog = flags & XDP_FLAGS_HW_MODE ? NULL : prog; offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog; Can you make this assumption here? If dev_change_xdp_fd() is called without XDP_FLAGS_HW_MODE or XDP_FLAGS_DRV_MODE flags, then we set prog to both, drv_prog and offload_prog. Is this expected? Maybe in nfp_net_xdp_setup() check for !hweight32(xdp_flags & XDP_FLAGS_MODES) and then set flags |= XDP_FLAGS_DRV_MODE before both assignments? - err = nfp_net_xdp_setup_drv(nn, prog, extack); + err = nfp_net_xdp_setup_drv(nn, drv_prog, extack); if (err) return err; - nfp_app_xdp_offload(nn->app, nn, offload_prog); + err = nfp_app_xdp_offload(nn->app, nn, offload_prog); + if (err && flags & XDP_FLAGS_HW_MODE) + return err; if (nn->xdp_prog) bpf_prog_put(nn->xdp_prog); @@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) switch (xdp->command) { case XDP_SETUP_PROG: + case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, xdp->extack); case XDP_QUERY_PROG:
Re: [RFC net-next 7/8] xdp: add reporting of offload mode
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Extend the XDP_ATTACHED_* values to include offloaded mode. Let drivers report whether program is installed in the driver or the HW by changing the prog_attached field from bool to u8 (type of the netlink attribute). Exploit the fact that the value of XDP_ATTACHED_DRV is 1, therefore since all drivers currently assign the mode with double negation: mode = !!xdp_prog; no drivers have to be modified. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann
Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Add an installation-time flag for requesting that the program be installed only if it can be offloaded to HW. Internally new command for ndo_xdp is added, this way we avoid putting checks into drivers since they all return -EINVAL on an unknown command. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann
Re: [RFC net-next 1/8] xdp: pass XDP flags into install handlers
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Pass XDP flags to the xdp ndo. This will allow drivers to look at the mode flags and make decisions about offload. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann
Re: [PATCH net-next 0/1] Introduction of the tc tests
Hi, On Fri, Jun 16, 2017 at 2:22 PM, Lucas Bates wrote: > Apologies for sending this as one big patch. I've been sitting on this a > little > too long, but it's ready and I wanted to get it out. > > There are a limited number of tests to start - I plan to add more on a regular > basis. > > Lucas Bates (1): > selftests: Introduce tc testsuite Nice work! Is there any particular reason you want to put these tests in the kernel tree, especially in tools/testing/selftests/?
Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs
On 06/20/2017 01:24 AM, Jakub Kicinski wrote: [...] The XDP_SETUP_PROG_HW command is purely for convenience of drivers without an offload. I felt it's not appropriate to burden all drivers with: if (xdp->flags & XDP_FLAGS_HW_MODE) return -EOPNOTSUPP; But, I do have a patch which does it, so I'm happy to drop the new command if it's preferred. Ahh, that makes sense, yep. I was only focused on reviewing this in the context of nfp driver. Lack of coffee. ;)
Re: [RFC net-next 5/8] nfp: bpf: take a reference on offloaded programs
On Tue, 20 Jun 2017 01:23:05 +0200, Daniel Borkmann wrote: > On 06/17/2017 01:57 AM, Jakub Kicinski wrote: > > The xdp_prog member of the adapter's data path structure is used > > for XDP in driver mode. In case a XDP program is loaded with in > > HW-only mode, we need to store it somewhere else. Add a new XDP > > prog pointer in the main structure and use that when we need to > > know whether any XDP program is loaded, not only a driver mode > > one. Only release our reference on adapter free instead of > > immediately after netdev unregister to allow offload to be disabled > > first. > > > > Signed-off-by: Jakub Kicinski> [...] > > @@ -3327,6 +3323,10 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct > > bpf_prog > > return err; > > > > nfp_app_xdp_offload(nn->app, nn, offload_prog); > > + > > + if (nn->xdp_prog) > > + bpf_prog_put(nn->xdp_prog); > > + nn->xdp_prog = prog; > > nn->xdp_flags = flags; > > > > return 0; > > Can you elaborate on the extra reference on the prog? Sorry, this patch went through a few revisions and the subject doesn't express the intent too well any more :S Originally it was about making sure we have a reference on the program when it's offloaded but not loaded in the driver, but I realized the we have the reference from dev_change_xdp_fd() already, so now the patch just releases the reference on the offloaded program. > So in nfp_net_xdp_setup(), assuming a prog was already loaded on > driver side: after your set, nfp_net_xdp_setup_drv() will then > do the xchg() on nn->dp.xdp_prog, bpf_prog_put() this one and > later back in nfp_net_xdp_setup() we check nn->xdp_prog and > bpf_prog_put() it if it existed before and update nn->xdp_prog > to the current prog. So you end up with two puts on the same > program, but I don't see where you take the one additional ref > aside from the ref that you already get from dev_change_xdp_fd(). > What am I missing? 
You are right, I missed there were two spots where I was doing a bpf_prog_put() in nfp_net_xdp_setup_drv(), thanks!
[PATCH net-next v2] enic: Fix format truncation warning
With -Wformat-truncation, gcc throws the following warning. Fix this by increasing the size of devname to accommodate 15 character netdev interface name and description. Remove length format precision for %s. We can fit entire name. Also increment the version. drivers/net/ethernet/cisco/enic/enic_main.c: In function ‘enic_open’: drivers/net/ethernet/cisco/enic/enic_main.c:1740:15: warning: ‘%u’ directive output may be truncated writing between 1 and 2 bytes into a region of size between 1 and 12 [-Wformat-truncation=] "%.11s-rx-%u", netdev->name, i); ^~ drivers/net/ethernet/cisco/enic/enic_main.c:1740:5: note: directive argument in the range [0, 16] "%.11s-rx-%u", netdev->name, i); ^ drivers/net/ethernet/cisco/enic/enic_main.c:1738:4: note: ‘snprintf’ output between 6 and 18 bytes into a destination of size 16 snprintf(enic->msix[intr].devname, ^~ sizeof(enic->msix[intr].devname), ~ "%.11s-rx-%u", netdev->name, i); ~~~ Signed-off-by: Govindarajulu Varadarajan--- v2: dont use kasprintf, increase the devname size http://patchwork.ozlabs.org/patch/777568/ drivers/net/ethernet/cisco/enic/enic.h | 4 ++-- drivers/net/ethernet/cisco/enic/enic_main.c | 8 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h index 2b23f46b34d3..ba032ac9ae86 100644 --- a/drivers/net/ethernet/cisco/enic/enic.h +++ b/drivers/net/ethernet/cisco/enic/enic.h @@ -33,7 +33,7 @@ #define DRV_NAME "enic" #define DRV_DESCRIPTION"Cisco VIC Ethernet NIC Driver" -#define DRV_VERSION"2.3.0.31" +#define DRV_VERSION"2.3.0.42" #define DRV_COPYRIGHT "Copyright 2008-2013 Cisco Systems, Inc" #define ENIC_BARS_MAX 6 @@ -47,7 +47,7 @@ struct enic_msix_entry { int requested; - char devname[IFNAMSIZ]; + char devname[IFNAMSIZ + 8]; irqreturn_t (*isr)(int, void *); void *devid; cpumask_var_t affinity_mask; diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 
6a9c8878aca0..d24ee1ad3be1 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1737,7 +1737,7 @@ static int enic_request_intr(struct enic *enic) intr = enic_msix_rq_intr(enic, i); snprintf(enic->msix[intr].devname, sizeof(enic->msix[intr].devname), - "%.11s-rx-%u", netdev->name, i); + "%s-rx-%u", netdev->name, i); enic->msix[intr].isr = enic_isr_msix; enic->msix[intr].devid = >napi[i]; } @@ -1748,7 +1748,7 @@ static int enic_request_intr(struct enic *enic) intr = enic_msix_wq_intr(enic, i); snprintf(enic->msix[intr].devname, sizeof(enic->msix[intr].devname), - "%.11s-tx-%u", netdev->name, i); + "%s-tx-%u", netdev->name, i); enic->msix[intr].isr = enic_isr_msix; enic->msix[intr].devid = >napi[wq]; } @@ -1756,14 +1756,14 @@ static int enic_request_intr(struct enic *enic) intr = enic_msix_err_intr(enic); snprintf(enic->msix[intr].devname, sizeof(enic->msix[intr].devname), - "%.11s-err", netdev->name); + "%s-err", netdev->name); enic->msix[intr].isr = enic_isr_msix_err; enic->msix[intr].devid = enic; intr = enic_msix_notify_intr(enic); snprintf(enic->msix[intr].devname, sizeof(enic->msix[intr].devname), - "%.11s-notify", netdev->name); + "%s-notify", netdev->name); enic->msix[intr].isr = enic_isr_msix_notify; enic->msix[intr].devid = enic; -- 2.13.1
Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs
On Tue, 20 Jun 2017 00:55:41 +0200, Daniel Borkmann wrote: > On 06/17/2017 01:57 AM, Jakub Kicinski wrote: > > Add an installation-time flag for requesting that the program > > be installed only if it can be offloaded to HW. > > > > Internally new command for ndo_xdp is added, this way we avoid > > putting checks into drivers since they all return -EINVAL on > > an unknown command. > > > > Signed-off-by: Jakub Kicinski> [...] > > diff --git a/net/core/dev.c b/net/core/dev.c > > index a04db264aa1c..05cec8e2cd82 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -6959,7 +6959,10 @@ static int dev_xdp_install(struct net_device *dev, > > xdp_op_t xdp_op, > > struct netdev_xdp xdp; > > > > memset(, 0, sizeof(xdp)); > > - xdp.command = XDP_SETUP_PROG; > > + if (flags & XDP_FLAGS_HW_MODE) > > + xdp.command = XDP_SETUP_PROG_HW; > > + else > > + xdp.command = XDP_SETUP_PROG; > > xdp.extack = extack; > > xdp.flags = flags; > > xdp.prog = prog; > > One thing I'm not sure I follow is that while you pass flags to the ndo > in patch 1, add a new XDP_SETUP_PROG_HW command here in patch 2 based on > the flags, and later on in patch 6, you don't really make use of it, but > look at the flags anyway? Then, why adding separate XDP_SETUP_PROG_HW > in the first place? > > [patch 6:] > @@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, > struct netdev_xdp *xdp) > > switch (xdp->command) { > case XDP_SETUP_PROG: > + case XDP_SETUP_PROG_HW: > return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, >xdp->extack); We still need the flags to be able to differentiate between default/no flags case where we load to the driver and the HW ("both"), and when the DRV_MODE flag is set, in which case we disable the HW offload and only load to the driver. We have three cases: drv offload no flag yesattempted DRV_MODE yesno HW_MODEno yes The XDP_SETUP_PROG_HW command is purely for convenience of drivers without an offload. 
I felt it's not appropriate to burden all drivers with: if (xdp->flags & XDP_FLAGS_HW_MODE) return -EOPNOTSUPP; But, I do have a patch which does it, so I'm happy to drop the new command if it's preferred.
Re: [RFC net-next 5/8] nfp: bpf: take a reference on offloaded programs
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: The xdp_prog member of the adapter's data path structure is used for XDP in driver mode. In case an XDP program is loaded in HW-only mode, we need to store it somewhere else. Add a new XDP prog pointer in the main structure and use that when we need to know whether any XDP program is loaded, not only a driver mode one. Only release our reference on adapter free instead of immediately after netdev unregister to allow offload to be disabled first. Signed-off-by: Jakub Kicinski [...] @@ -3327,6 +3323,10 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return err; nfp_app_xdp_offload(nn->app, nn, offload_prog); + + if (nn->xdp_prog) + bpf_prog_put(nn->xdp_prog); + nn->xdp_prog = prog; nn->xdp_flags = flags; return 0; Can you elaborate on the extra reference on the prog? So in nfp_net_xdp_setup(), assuming a prog was already loaded on driver side: after your set, nfp_net_xdp_setup_drv() will then do the xchg() on nn->dp.xdp_prog, bpf_prog_put() this one and later back in nfp_net_xdp_setup() we check nn->xdp_prog and bpf_prog_put() it if it existed before and update nn->xdp_prog to the current prog. So you end up with two puts on the same program, but I don't see where you take the one additional ref aside from the ref that you already get from dev_change_xdp_fd(). What am I missing?
Re: [RFC net-next 2/8] xdp: add HW offload mode flag for installing programs
On 06/17/2017 01:57 AM, Jakub Kicinski wrote: Add an installation-time flag for requesting that the program be installed only if it can be offloaded to HW. Internally new command for ndo_xdp is added, this way we avoid putting checks into drivers since they all return -EINVAL on an unknown command. Signed-off-by: Jakub Kicinski[...] diff --git a/net/core/dev.c b/net/core/dev.c index a04db264aa1c..05cec8e2cd82 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6959,7 +6959,10 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, struct netdev_xdp xdp; memset(, 0, sizeof(xdp)); - xdp.command = XDP_SETUP_PROG; + if (flags & XDP_FLAGS_HW_MODE) + xdp.command = XDP_SETUP_PROG_HW; + else + xdp.command = XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; One thing I'm not sure I follow is that while you pass flags to the ndo in patch 1, add a new XDP_SETUP_PROG_HW command here in patch 2 based on the flags, and later on in patch 6, you don't really make use of it, but look at the flags anyway? Then, why adding separate XDP_SETUP_PROG_HW in the first place? [patch 6:] @@ -3338,6 +3341,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) switch (xdp->command) { case XDP_SETUP_PROG: + case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, xdp->extack);
Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control
On 06/18/2017 04:39 AM, Lawrence Brakmo wrote: On 6/16/17, 6:58 AM, "Daniel Borkmann"wrote: [...] > /* Change congestion control for socket */ > -int tcp_set_congestion_control(struct sock *sk, const char *name) > +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) > { > struct inet_connection_sock *icsk = inet_csk(sk); > const struct tcp_congestion_ops *ca; > @@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) > return -EPERM; > > rcu_read_lock(); > - ca = __tcp_ca_find_autoload(name); > + if (!load) > + ca = tcp_ca_find(name); > + else > + ca = __tcp_ca_find_autoload(name); From BPF program side, we call with !load since we're not allowed to sleep under RCU, that's correct ... > /* No change asking for existing value */ > if (ca == icsk->icsk_ca_ops) { > icsk->icsk_ca_setsockopt = 1; > @@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) > } > if (!ca) > err = -ENOENT; > + else if (!load) > + icsk->icsk_ca_ops = ca; ... but don't we also need to hold a module ref in this case as done below? Meaning, tcp_ca_find() could return a ca that was previously loaded to the tcp_cong_list as module, then resulting in ref count imbalance when set from BPF? As I mentioned above, this can be called before congestion has been initialized (op <= BPF_SOCKET_OPS_NEEDS_ECN) in which case tcp_init_congestion_control will be called later. If op > ..OPS_NEEDS_ECN then bpf_setsockopt() will call the reinit_congestion_control(). But this points to an issue where someone else could call tcp_set_congestion_control() with load == false not knowing they need to call either init or reinit. I will add a comment to the function to make it clear. Hm, I'm not sure it answers my question. 
What I meant was that from BPF prog, you're setting tcp_set_congestion_control(..., false) so if tcp_ca_find() returns a ca that was loaded earlier from a module (so it becomes available in tcp_cong_list), the above... [...] else if (!load) icsk->icsk_ca_ops = ca; [...] ... will basically prevent the later try_module_get() on the ca. So any later tcp_reinit_congestion_control() or tcp_init_congestion_control() will still run not having the refcount held on the owner module. Meaning a module unload would let the machine crash due to the refcnt imbalance? What am I missing?
Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test
On Mon, Jun 19, 2017 at 02:00:43PM -0700, Florian Fainelli wrote: > On 06/16/2017 02:24 AM, Lin Yun Sheng wrote: > > This patch fixes the phy loopback self_test failed issue. when > > Marvell Phy Module is loaded, it will powerdown fiber when doing > > phy loopback self test, which cause phy loopback self_test fail. > > > > Signed-off-by: Lin Yun Sheng> > --- > > drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++-- > > 1 file changed, 14 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > > b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > > index b8fab14..e95795b 100644 > > --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > > +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > > @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct > > phy_device *phy_dev, u8 en) > > The question really is, why is not this properly integrated into the PHY > driver and PHYLIB such that the only thing the Ethernet MAC driver has > to call is a function of the PHY driver putting it in self-test? This whole driver pokes various PHY registers, rather than use phylib. And it does so without taking the PHY lock. It also assumes it is a Marvell PHY and i don't see anywhere it actually verifies this. This is all broken. Andrew
Re: Reply Urgent
Hello, How are you doing? I have been sent to inform you that, We have an inheritance of a deceased client with your surname. Contact Mr Andrew Bailey Reply Email To: myinf...@gmail.com with your "Full Names" for more info. Thanks for your understanding. Reply ASAP thank you. Melissa. -- Correo Corporativo Hospital Universitario del Valle E.S.E *** "Estamos re-dimensionandonos para crecer!" **
Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test
On 06/16/2017 02:24 AM, Lin Yun Sheng wrote: > This patch fixes the phy loopback self_test failed issue. when > Marvell Phy Module is loaded, it will powerdown fiber when doing > phy loopback self test, which cause phy loopback self_test fail. > > Signed-off-by: Lin Yun Sheng> --- > drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 16 ++-- > 1 file changed, 14 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > index b8fab14..e95795b 100644 > --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c > @@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct phy_device > *phy_dev, u8 en) The question really is, why is not this properly integrated into the PHY driver and PHYLIB such that the only thing the Ethernet MAC driver has to call is a function of the PHY driver putting it in self-test? > > /* Force 1000M Link, Default is 0x0200 */ > phy_write(phy_dev, 7, 0x20C); > - phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); > > - /* Enable PHY loop-back */ > + /* Powerup Fiber */ > + phy_write(phy_dev, HNS_PHY_PAGE_REG, 1); > + val = phy_read(phy_dev, COPPER_CONTROL_REG); > + val &= ~PHY_POWER_DOWN; > + phy_write(phy_dev, COPPER_CONTROL_REG, val); > + > + /* Enable Phy Loopback */ > + phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); > val = phy_read(phy_dev, COPPER_CONTROL_REG); > val |= PHY_LOOP_BACK; > val &= ~PHY_POWER_DOWN; > @@ -299,6 +305,12 @@ static int hns_nic_config_phy_loopback(struct phy_device > *phy_dev, u8 en) > phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA); > phy_write(phy_dev, 1, 0x400); > phy_write(phy_dev, 7, 0x200); > + > + phy_write(phy_dev, HNS_PHY_PAGE_REG, 1); > + val = phy_read(phy_dev, COPPER_CONTROL_REG); > + val |= PHY_POWER_DOWN; > + phy_write(phy_dev, COPPER_CONTROL_REG, val); > + > phy_write(phy_dev, HNS_PHY_PAGE_REG, 0); > phy_write(phy_dev, 9, 0xF00); > > -- Florian
Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops
On 6/19/17, 11:44 AM, "Daniel Borkmann"wrote: On 06/17/2017 01:41 AM, Lawrence Brakmo wrote: > On 6/16/17, 5:07 AM, "Daniel Borkmann" wrote: [...] > I see. You are saying have one struct in common but still keep the two > PROG_TYPES? That makes sense. Do we really need two different > is_valid_access functions? Both types should be able to see all > the fields (otherwise adding new fields becomes messy). Would probably leave the two is_valid_access() separate initially, and once people ask for it we could potentially open this up to some of the other fields that are available at that time. As discussed in the other thread, I will keep the 2 structs > > Currently there are two types of ops. The first type expects the BPF > > program to return a value which is then used by the caller (or a > > negative value to indicate the operation is not supported). The second > > type expects state changes to be done by the BPF program, for example > > through a setsockopt BPF helper function, and they ignore the return > > value. [...] > > +/* Call BPF_SOCKET_OPS program that returns an int. If the return value > > + * is < 0, then the BPF op failed (for example if the loaded BPF > > + * program does not support the chosen operation or there is no BPF > > + * program loaded). > > + */ > > +#ifdef CONFIG_BPF > > +static inline int tcp_call_bpf(struct sock *sk, bool is_req_sock, int op) > > +{ > > + struct bpf_socket_ops_kern socket_ops; > > + > > + memset(_ops, 0, sizeof(socket_ops)); > > + socket_ops.sk = sk; > > + socket_ops.is_req_sock = is_req_sock ? 1 : 0; > > Is is_req_sock actually used here in this patch (apart from setting it)? > Not seeing that BPF prog will access it, if it also shouldn't access it, > then bool type would be better. > > The only reason I used a bit was in case I wanted to add more fields later on. > Does it make sense or should I just use bool? 
Didn't know that, but I think starting out with bool seems a bit cleaner, if needed we could later still switch to bitfield. Done. > > + socket_ops.op = op; > > + > > + return bpf_socket_ops_call(_ops); > > +} [...] > > +/* Global BPF program for sockets */ > > +static struct bpf_prog *bpf_socket_ops_prog; > > +static DEFINE_RWLOCK(bpf_socket_ops_lock); > > + > > +int bpf_socket_ops_set_prog(int fd) > > +{ > > + int err = 0; > > + > > + write_lock(_socket_ops_lock); > > + if (bpf_socket_ops_prog) { > > + bpf_prog_put(bpf_socket_ops_prog); > > + bpf_socket_ops_prog = NULL; > > + } > > + > > + /* fd of zero is used as a signal to remove the current > > + * bpf_socket_ops_prog. > > + */ > > + if (fd == 0) { > > Can we make the fd related semantics similar to dev_change_xdp_fd()? > > Do you mean remove program is fd < 0 instead of == 0? Yes, that and also the ordering of dropping the ref of the existing bpf_socket_ops_prog program with setting the new one, so you can convert bpf_socket_ops_prog to RCU more easily. I made lots of changes to how we set/attach the global_sock_ops program affecting the files kernel/bpf/syscall.c, net/core/sock_bpfops.c and samples/bpf/tcp_bpf.c. The patch set will be submitted later today. > > + write_unlock(_socket_ops_lock); > > + return 1; > > + } > > + > > + bpf_socket_ops_prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_OPS); > > + if (IS_ERR(bpf_socket_ops_prog)) { > > + bpf_prog_put(bpf_socket_ops_prog); > > This will crash the kernel, passing err value to bpf_prog_put(). [...] Thanks again for the feedback.
Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops
On 6/19/17, 11:52 AM, "Daniel Borkmann"wrote: On 06/17/2017 11:48 PM, Lawrence Brakmo wrote: > On 6/16/17, 5:07 AM, "Daniel Borkmann" wrote: > > On 06/15/2017 10:08 PM, Lawrence Brakmo wrote: > > Two new corresponding structs (one for the kernel one for the user/BPF > > program): > > > > /* kernel version */ > > struct bpf_socket_ops_kern { > > struct sock *sk; > >__u32 is_req_sock:1; > > __u32 op; > > union { > > __u32 reply; > > __u32 replylong[4]; > > }; > > }; > > > > /* user version */ > > struct bpf_socket_ops { > > __u32 op; > > union { > > __u32 reply; > > __u32 replylong[4]; > > }; > > __u32 family; > > __u32 remote_ip4; > > __u32 local_ip4; > > __u32 remote_ip6[4]; > > __u32 local_ip6[4]; > > __u32 remote_port; > > __u32 local_port; > > }; > > Above and ... > > struct bpf_sock { > __u32 bound_dev_if; > __u32 family; > __u32 type; > __u32 protocol; > }; > > ... would result in two BPF sock user versions. It's okayish, but > given struct bpf_sock is quite generic, couldn't we merge the members > from struct bpf_socket_ops into struct bpf_sock instead? > > Idea would be that sock_filter_is_valid_access() for cgroups would > then check off < 0 || off + size > offsetofend(struct bpf_sock, protocol) > to disallow new members, and your socket_ops_is_valid_access() could > allow and xlate the full range. The family member is already duplicate > and the others could then be accessed from these kind of BPF progs as > well, plus we have a single user representation similar as with __sk_buff > that multiple types will use. > > I am concerned that it could make usage more confusing. One type of > sock program (cgroup) could only use a subset of the fields while the > other type (socket_ops) could use all (or a different subset). Then what > happens if there is a need to add a new field to cgroup type sock program? > In addition, in the near future I will have a patch to attach socket_ops > programs to cgroups. > I rather leave it as it is. 
Okay, I'm fine with that as well. For the __sk_buff, we also have the case that some members are not available for all program types like tc_classid, so it's similar there. But if indeed the majority of members cannot be supported for the most part (?) then having different structs seems okay, probably easier to use, but we should try hard to not end up with 10 different uapi socket structs that apply to program types working on sockets in one way or another. Agree 100%.
[PATCH net-next v2 1/4] rtnetlink: add NEWCACHEREPORT message type
New NEWCACHEREPORT message type to be used for cache reports sent via Netlink, effectively allowing splitting cache report reception from mroute programming. Suggested-by: Ryan HalbrookSigned-off-by: Julien Gomes --- include/uapi/linux/rtnetlink.h | 3 +++ security/selinux/nlmsgtab.c| 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 564790e854f7..cd1afb900929 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -146,6 +146,9 @@ enum { RTM_GETSTATS = 94, #define RTM_GETSTATS RTM_GETSTATS + RTM_NEWCACHEREPORT = 96, +#define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + __RTM_MAX, #define RTM_MAX(((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 5aeaf30b7a13..7b7433a1a34c 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -79,6 +79,7 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_GETNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -158,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: /* RTM_MAX always point to RTM_SET, ie RTM_NEWxxx + 3 */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWSTATS + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- 2.13.1
[PATCH net-next v2 0/4] ipmr/ip6mr: add Netlink notifications on cache reports
Currently, all ipmr/ip6mr cache reports are sent through the mroute/mroute6 socket only. This forces the use of a single socket for mroute programming, cache reports and, regarding ipmr, IGMP messages without Router Alert option reception. The present patches are aiming to send Netlink notifications in addition to the existing igmpmsg/mrt6msg to give user programs a way to handle cache reports in parallel with multiple sockets other than the mroute/mroute6 socket. Changes in v2: - Changed attributes naming from {IPMRA,IP6MRA}_CACHEREPORTA_* to {IPMRA,IP6MRA}_CREPORT_* - Improved packet data copy to handle non-linear packets in ipmr/ip6mr cache report Netlink notification creation - Added two rtnetlink groups with restricted-binding - Changed cache report notified groups from RTNL_{IPV4,IPV6}_MROUTE to the new restricted groups in ipmr/ip6mr Julien Gomes (4): rtnetlink: add NEWCACHEREPORT message type rtnetlink: add restricted rtnl groups for ipv4 and ipv6 mroute ipmr: add netlink notifications on igmpmsg cache reports ip6mr: add netlink notifications on mrt6msg cache reports include/uapi/linux/mroute.h| 12 include/uapi/linux/mroute6.h | 12 include/uapi/linux/rtnetlink.h | 7 + net/core/rtnetlink.c | 13 net/ipv4/ipmr.c| 67 -- net/ipv6/ip6mr.c | 67 -- security/selinux/nlmsgtab.c| 3 +- 7 files changed, 176 insertions(+), 5 deletions(-) -- 2.13.1
[PATCH net-next v2 3/4] ipmr: add netlink notifications on igmpmsg cache reports
Add Netlink notifications on cache reports in ipmr, in addition to the existing igmpmsg sent to mroute_sk. Send RTM_NEWCACHEREPORT notifications to RTNLGRP_IPV4_MROUTE_R. MSGTYPE, VIF_ID, SRC_ADDR and DST_ADDR Netlink attributes contain the same data as their equivalent fields in the igmpmsg header. PKT attribute is the packet sent to mroute_sk, without the added igmpmsg header. Suggested-by: Ryan HalbrookSigned-off-by: Julien Gomes --- include/uapi/linux/mroute.h | 12 net/ipv4/ipmr.c | 67 +++-- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index f904367c0cee..e8e5041dea8e 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -152,6 +152,18 @@ enum { }; #define IPMRA_VIFA_MAX (__IPMRA_VIFA_MAX - 1) +/* ipmr netlink cache report attributes */ +enum { + IPMRA_CREPORT_UNSPEC, + IPMRA_CREPORT_MSGTYPE, + IPMRA_CREPORT_VIF_ID, + IPMRA_CREPORT_SRC_ADDR, + IPMRA_CREPORT_DST_ADDR, + IPMRA_CREPORT_PKT, + __IPMRA_CREPORT_MAX +}; +#define IPMRA_CREPORT_MAX (__IPMRA_CREPORT_MAX - 1) + /* That's all usermode folks */ #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3e7454aa49e8..1e591bcaad6d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -109,6 +109,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); +static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); @@ -995,8 +996,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, } } -/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted - * expects the following bizarre scheme. 
+/* Bounce a cache query up to mrouted and netlink. * * Called under mrt_lock. */ @@ -1062,6 +1062,8 @@ static int ipmr_cache_report(struct mr_table *mrt, return -EINVAL; } + igmpmsg_netlink_event(mrt, skb); + /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); rcu_read_unlock(); @@ -2341,6 +2343,67 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } +static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) +{ + struct net *net = read_pnet(>net); + struct nlmsghdr *nlh; + struct rtgenmsg *rtgenm; + struct igmpmsg *msg; + struct sk_buff *skb; + struct nlattr *nla; + int payloadlen; + int msgsize; + + payloadlen = pkt->len - sizeof(struct igmpmsg); + msg = (struct igmpmsg *)skb_network_header(pkt); + msgsize = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + + nla_total_size(1) + /* IPMRA_CREPORT_MSGTYPE */ + + nla_total_size(1) + /* IPMRA_CREPORT_VIF_ID */ + + nla_total_size(4) + /* IPMRA_CREPORT_SRC_ADDR */ + + nla_total_size(4) + /* IPMRA_CREPORT_DST_ADDR */ + + nla_total_size(payloadlen) + /* IPMRA_CREPORT_PKT */ + ; + + skb = nlmsg_new(msgsize, GFP_ATOMIC); + if (!skb) + goto errout; + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, + sizeof(struct rtgenmsg), 0); + if (!nlh) + goto errout; + rtgenm = nlmsg_data(nlh); + rtgenm->rtgen_family = RTNL_FAMILY_IPMR; + if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || + nla_put_u8(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) || + nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, + msg->im_src.s_addr) || + nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, + msg->im_dst.s_addr)) + goto nla_put_failure; + + nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); + if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg), + nla_data(nla), payloadlen)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC); + return; + +nla_put_failure: + nlmsg_cancel(skb, nlh); 
+errout: + kfree_skb(skb); +
[PATCH net-next v2 2/4] rtnetlink: add restricted rtnl groups for ipv4 and ipv6 mroute
Add RTNLGRP_{IPV4,IPV6}_MROUTE_R as two new restricted groups for the NETLINK_ROUTE family. Binding to these groups specifically requires CAP_NET_ADMIN to allow multicast of sensitive messages (e.g. mroute cache reports). Signed-off-by: Julien Gomes--- include/uapi/linux/rtnetlink.h | 4 net/core/rtnetlink.c | 13 + 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cd1afb900929..d148505010a7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -669,6 +669,10 @@ enum rtnetlink_groups { #define RTNLGRP_NSID RTNLGRP_NSID RTNLGRP_MPLS_NETCONF, #define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R __RTNLGRP_MAX }; #define RTNLGRP_MAX(__RTNLGRP_MAX - 1) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3aa57848a895..4aefa5a2625f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4218,6 +4218,18 @@ static void rtnetlink_rcv(struct sk_buff *skb) rtnl_unlock(); } +static int rtnetlink_bind(struct net *net, int group) +{ + switch (group) { + case RTNLGRP_IPV4_MROUTE_R: + case RTNLGRP_IPV6_MROUTE_R: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + break; + } + return 0; +} + static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -4252,6 +4264,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .input = rtnetlink_rcv, .cb_mutex = _mutex, .flags = NL_CFG_F_NONROOT_RECV, + .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, ); -- 2.13.1
[PATCH net-next v2 4/4] ip6mr: add netlink notifications on mrt6msg cache reports
Add Netlink notifications on cache reports in ip6mr, in addition to the existing mrt6msg sent to mroute6_sk. Send RTM_NEWCACHEREPORT notifications to RTNLGRP_IPV6_MROUTE_R. MSGTYPE, MIF_ID, SRC_ADDR and DST_ADDR Netlink attributes contain the same data as their equivalent fields in the mrt6msg header. PKT attribute is the packet sent to mroute6_sk, without the added mrt6msg header. Suggested-by: Ryan HalbrookSigned-off-by: Julien Gomes --- include/uapi/linux/mroute6.h | 12 net/ipv6/ip6mr.c | 67 ++-- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index ed5721148768..e4746816c855 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -133,4 +133,16 @@ struct mrt6msg { struct in6_addr im6_src, im6_dst; }; +/* ip6mr netlink cache report attributes */ +enum { + IP6MRA_CREPORT_UNSPEC, + IP6MRA_CREPORT_MSGTYPE, + IP6MRA_CREPORT_MIF_ID, + IP6MRA_CREPORT_SRC_ADDR, + IP6MRA_CREPORT_DST_ADDR, + IP6MRA_CREPORT_PKT, + __IP6MRA_CREPORT_MAX +}; +#define IP6MRA_CREPORT_MAX (__IP6MRA_CREPORT_MAX - 1) + #endif /* _UAPI__LINUX_MROUTE6_H */ diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index b0e2bf1f4212..28a1fb49f12e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -116,6 +116,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm); static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, int cmd); +static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr6_table *mrt, bool all); @@ -1125,8 +1126,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, } /* - * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd - * expects the following bizarre scheme. + * Bounce a cache query up to pim6sd and netlink. 
* * Called under mrt_lock. */ @@ -1208,6 +1208,8 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, return -EINVAL; } + mrt6msg_netlink_event(mrt, skb); + /* * Deliver to user space multicast routing algorithms */ @@ -2457,6 +2459,67 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } +static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt) +{ + struct net *net = read_pnet(>net); + struct nlmsghdr *nlh; + struct rtgenmsg *rtgenm; + struct mrt6msg *msg; + struct sk_buff *skb; + struct nlattr *nla; + int payloadlen; + int msgsize; + + payloadlen = pkt->len - sizeof(struct mrt6msg); + msg = (struct mrt6msg *)skb_transport_header(pkt); + msgsize = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + + nla_total_size(1) + /* IP6MRA_CREPORT_MSGTYPE */ + + nla_total_size(2) + /* IP6MRA_CREPORT_MIF_ID */ + + nla_total_size(sizeof(struct in6_addr)) + /* IP6MRA_CREPORT_SRC_ADDR */ + + nla_total_size(sizeof(struct in6_addr)) + /* IP6MRA_CREPORT_DST_ADDR */ + + nla_total_size(payloadlen) + /* IP6MRA_CREPORT_PKT */ + ; + + skb = nlmsg_new(msgsize, GFP_ATOMIC); + if (!skb) + goto errout; + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, + sizeof(struct rtgenmsg), 0); + if (!nlh) + goto errout; + rtgenm = nlmsg_data(nlh); + rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; + if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || + nla_put_u16(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || + nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, +>im6_src) || + nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, +>im6_dst)) + goto nla_put_failure; + + nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); + if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), + nla_data(nla), payloadlen)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); + return; + +nla_put_failure: +
[PATCH 3/5] rtlwifi: Add and use convenience macro rtl_btc
bluetooth coexistence functions always check get_btc_status before accessing the function. Centralize this via a convenience macro to neaten the source code a little. Signed-off-by: Joe Perches--- drivers/net/wireless/realtek/rtlwifi/base.c | 8 ++-- drivers/net/wireless/realtek/rtlwifi/core.c | 12 +++- drivers/net/wireless/realtek/rtlwifi/pci.c | 4 +--- drivers/net/wireless/realtek/rtlwifi/ps.c | 24 ++-- drivers/net/wireless/realtek/rtlwifi/wifi.h | 7 +++ 5 files changed, 23 insertions(+), 32 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index cc4b50e1b7e5..997dd692e6bb 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1312,11 +1312,9 @@ static void setup_arp_tx(struct rtl_priv *rtlpriv, struct rtl_ps_ctl *ppsc) { struct ieee80211_hw *hw = rtlpriv->hw; struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; rtlpriv->ra.is_special_data = true; - if (ops->get_btc_status()) - btc_ops->btc_special_packet_notify(rtlpriv, 1); + rtl_btc(rtlpriv, ops, btc_special_packet_notify(rtlpriv, 1)); rtl_lps_leave(hw); ppsc->last_delaylps_stamp_jiffies = jiffies; } @@ -1575,7 +1573,6 @@ void rtl_watchdog_wq_callback(void *data) struct ieee80211_hw *hw = rtlworks->hw; struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw)); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); bool busytraffic = false; @@ -1714,8 +1711,7 @@ void rtl_watchdog_wq_callback(void *data) } } - if (ops->get_btc_status()) - btc_ops->btc_periodical(rtlpriv); + rtl_btc(rtlpriv, ops, btc_periodical(rtlpriv)); rtlpriv->link_info.bcn_rx_inperiod = 0; } diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index e08febc2d0d6..8d3eddeeffea 100644 --- 
a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -1046,7 +1046,6 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_hal *rtlhal = rtl_hal(rtlpriv); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw)); @@ -1193,8 +1192,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, ppsc->report_linked = (mstatus == RT_MEDIA_CONNECT) ? true : false; - if (ops->get_btc_status()) - btc_ops->btc_mediastatus_notify(rtlpriv, mstatus); + rtl_btc(rtlpriv, ops, btc_mediastatus_notify(rtlpriv, mstatus)); } if (changed & BSS_CHANGED_ERP_CTS_PROT) { @@ -1428,7 +1426,6 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n"); @@ -1438,8 +1435,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw, return; } - if (ops->get_btc_status()) - btc_ops->btc_scan_notify(rtlpriv, 1); + rtl_btc(rtlpriv, ops, btc_scan_notify(rtlpriv, 1)); if (rtlpriv->dm.supp_phymode_switch) { if (ops->chk_switch_dmdp) @@ -1465,7 +1461,6 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n"); @@ -1492,8 +1487,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw *hw, } ops->scan_operation_backup(hw, SCAN_OPT_RESTORE); - if (ops->get_btc_status()) - btc_ops->btc_scan_notify(rtlpriv, 0); + rtl_btc(rtlpriv, ops, btc_scan_notify(rtlpriv, 0)); } 
static int rtl_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index f129c4c76c05..fa93401acdab 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1859,15 +1859,13 @@ static void rtl_pci_stop(struct
[PATCH 0/5] rtlwifi: Neatening
Joe Perches (5): rtlwifi: Use temporary ops variable to reduce code size rtlwifi: Use temporary variable btc_ops for rtlpriv->btcoexist.btc_ops rtlwifi: Add and use convenience macro rtl_btc realtek: btcoexist: Make the rtl_btc_ops struct const realtek: rtlwifi: drivers: Use the rtl_btc convenience macro drivers/net/wireless/realtek/rtlwifi/base.c| 46 ++-- .../wireless/realtek/rtlwifi/btcoexist/rtl_btc.c | 4 +- .../wireless/realtek/rtlwifi/btcoexist/rtl_btc.h | 2 +- drivers/net/wireless/realtek/rtlwifi/core.c| 189 drivers/net/wireless/realtek/rtlwifi/efuse.c | 9 +- drivers/net/wireless/realtek/rtlwifi/pci.c | 242 +++-- drivers/net/wireless/realtek/rtlwifi/ps.c | 83 +++ .../net/wireless/realtek/rtlwifi/rtl8192ee/hw.c| 3 +- .../wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c | 4 +- .../net/wireless/realtek/rtlwifi/rtl8723ae/hw.c| 3 +- .../net/wireless/realtek/rtlwifi/rtl8723be/hw.c| 3 +- .../net/wireless/realtek/rtlwifi/rtl8821ae/fw.c| 6 +- .../net/wireless/realtek/rtlwifi/rtl8821ae/hw.c| 3 +- drivers/net/wireless/realtek/rtlwifi/usb.c | 40 ++-- drivers/net/wireless/realtek/rtlwifi/wifi.h| 9 +- 15 files changed, 336 insertions(+), 310 deletions(-) -- 2.10.0.rc2.1.g053435c
[PATCH 2/5] rtlwifi: Use temporary variable btc_ops for rtlpriv->btcoexist.btc_ops
Reduce the code line length a little. Signed-off-by: Joe Perches--- drivers/net/wireless/realtek/rtlwifi/base.c | 7 --- drivers/net/wireless/realtek/rtlwifi/core.c | 10 ++ drivers/net/wireless/realtek/rtlwifi/pci.c | 8 +--- drivers/net/wireless/realtek/rtlwifi/ps.c | 16 ++-- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 4436addcace3..cc4b50e1b7e5 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1312,11 +1312,11 @@ static void setup_arp_tx(struct rtl_priv *rtlpriv, struct rtl_ps_ctl *ppsc) { struct ieee80211_hw *hw = rtlpriv->hw; struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; rtlpriv->ra.is_special_data = true; if (ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_special_packet_notify( - rtlpriv, 1); + btc_ops->btc_special_packet_notify(rtlpriv, 1); rtl_lps_leave(hw); ppsc->last_delaylps_stamp_jiffies = jiffies; } @@ -1575,6 +1575,7 @@ void rtl_watchdog_wq_callback(void *data) struct ieee80211_hw *hw = rtlworks->hw; struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw)); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); bool busytraffic = false; @@ -1714,7 +1715,7 @@ void rtl_watchdog_wq_callback(void *data) } if (ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_periodical(rtlpriv); + btc_ops->btc_periodical(rtlpriv); rtlpriv->link_info.bcn_rx_inperiod = 0; } diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 63f5c0cd6935..e08febc2d0d6 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -1046,6 +1046,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, { struct 
rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_hal *rtlhal = rtl_hal(rtlpriv); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw)); @@ -1193,8 +1194,7 @@ static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, true : false; if (ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_mediastatus_notify( - rtlpriv, mstatus); + btc_ops->btc_mediastatus_notify(rtlpriv, mstatus); } if (changed & BSS_CHANGED_ERP_CTS_PROT) { @@ -1428,6 +1428,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n"); @@ -1438,7 +1439,7 @@ static void rtl_op_sw_scan_start(struct ieee80211_hw *hw, } if (ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_scan_notify(rtlpriv, 1); + btc_ops->btc_scan_notify(rtlpriv, 1); if (rtlpriv->dm.supp_phymode_switch) { if (ops->chk_switch_dmdp) @@ -1464,6 +1465,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); RT_TRACE(rtlpriv, COMP_MAC80211, DBG_LOUD, "\n"); @@ -1491,7 +1493,7 @@ static void rtl_op_sw_scan_complete(struct ieee80211_hw *hw, ops->scan_operation_backup(hw, SCAN_OPT_RESTORE); if (ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_scan_notify(rtlpriv, 0); + btc_ops->btc_scan_notify(rtlpriv, 0); } static int rtl_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 81c36978df8a..f129c4c76c05 100644 --- 
a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1814,6 +1814,7 @@ static int rtl_pci_start(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; + struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
[PATCH 1/5] rtlwifi: Use temporary ops variable to reduce code size
rtlpriv->cfg->ops-> is used frequently in the source. Repeated use of this multiply dereferenced table creates larger objects. Using a temporary reduces code size as well as source code line length. $ size -t drivers/net/wireless/realtek/rtlwifi/*.o.defconfig.new textdata bss dec hex filename 126523024 24 157003d54 [...]/base.o.defconfig.new 16700 83 0 16783418f [...]/core.o.defconfig.new 10794 0 4 107982a2e [...]/efuse.o.defconfig.new 20988 5 0 209935201 [...]/pci.o.defconfig.new 6182 8 06190182e [...]/ps.o.defconfig.new 8410 1 4841520df [...]/usb.o.defconfig.new 757263121 32 78879 1341f (TOTALS) $ size -t drivers/net/wireless/realtek/rtlwifi/*.o.defconfig.old textdata bss dec hex filename 126043024 24 156523d24 [...]/base.o.defconfig.old 16892 83 0 16975424f [...]/core.o.defconfig.old 10794 0 4 107982a2e [...]/efuse.o.defconfig.old 21161 5 0 2116652ae [...]/pci.o.defconfig.old 6262 8 06270187e [...]/ps.o.defconfig.old 8435 1 4844020f8 [...]/usb.o.defconfig.old 761483121 32 79301 135c5 (TOTALS) Miscellanea around modified code: o Fix a few misindented code blocks o Realign arguments o Ignored 80 column checkpatch warnings Signed-off-by: Joe Perches--- drivers/net/wireless/realtek/rtlwifi/base.c | 45 ++--- drivers/net/wireless/realtek/rtlwifi/core.c | 187 +++-- drivers/net/wireless/realtek/rtlwifi/efuse.c | 9 +- drivers/net/wireless/realtek/rtlwifi/pci.c | 236 ++- drivers/net/wireless/realtek/rtlwifi/ps.c| 75 + drivers/net/wireless/realtek/rtlwifi/usb.c | 40 +++-- 6 files changed, 313 insertions(+), 279 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 710e5b447cff..4436addcace3 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(rtl_deinit_deferred_work); void rtl_init_rfkill(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); - + struct rtl_hal_ops *ops = rtlpriv->cfg->ops; bool 
radio_state; bool blocked; u8 valid = 0; @@ -507,7 +507,7 @@ void rtl_init_rfkill(struct ieee80211_hw *hw) rtlpriv->rfkill.rfkill_state = true; wiphy_rfkill_set_hw_state(hw->wiphy, 0); - radio_state = rtlpriv->cfg->ops->radio_onoff_checking(hw, ); + radio_state = ops->radio_onoff_checking(hw, ); if (valid) { pr_info("rtlwifi: wireless switch is %s\n", @@ -588,8 +588,9 @@ void rtl_init_rx_config(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); + struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - rtlpriv->cfg->ops->get_hw_reg(hw, HW_VAR_RCR, (u8 *) (>rx_conf)); + ops->get_hw_reg(hw, HW_VAR_RCR, (u8 *)>rx_conf); } EXPORT_SYMBOL_GPL(rtl_init_rx_config); @@ -1178,13 +1179,14 @@ bool rtl_tx_mgmt_proc(struct ieee80211_hw *hw, struct sk_buff *skb) { struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); struct rtl_priv *rtlpriv = rtl_priv(hw); + struct rtl_hal_ops *ops = rtlpriv->cfg->ops; __le16 fc = rtl_get_fc(skb); if (rtlpriv->dm.supp_phymode_switch && mac->link_state < MAC80211_LINKED && (ieee80211_is_auth(fc) || ieee80211_is_probe_req(fc))) { - if (rtlpriv->cfg->ops->chk_switch_dmdp) - rtlpriv->cfg->ops->chk_switch_dmdp(hw); + if (ops->chk_switch_dmdp) + ops->chk_switch_dmdp(hw); } if (ieee80211_is_auth(fc)) { RT_TRACE(rtlpriv, COMP_SEND, DBG_DMESG, "MAC80211_LINKING\n"); @@ -1309,11 +1311,12 @@ EXPORT_SYMBOL_GPL(rtl_action_proc); static void setup_arp_tx(struct rtl_priv *rtlpriv, struct rtl_ps_ctl *ppsc) { struct ieee80211_hw *hw = rtlpriv->hw; + struct rtl_hal_ops *ops = rtlpriv->cfg->ops; rtlpriv->ra.is_special_data = true; - if (rtlpriv->cfg->ops->get_btc_status()) + if (ops->get_btc_status()) rtlpriv->btcoexist.btc_ops->btc_special_packet_notify( - rtlpriv, 1); + rtlpriv, 1); rtl_lps_leave(hw); ppsc->last_delaylps_stamp_jiffies = jiffies; } @@ -1571,6 +1574,7 @@ void rtl_watchdog_wq_callback(void *data) watchdog_wq); struct ieee80211_hw *hw = rtlworks->hw; struct rtl_priv *rtlpriv = rtl_priv(hw); + struct 
rtl_hal_ops *ops = rtlpriv->cfg->ops; struct rtl_hal *rtlhal =
[PATCH 4/5] realtek: btcoexist: Make the rtl_btc_ops struct const
Avoid allowing a write into what should be const. Signed-off-by: Joe Perches--- drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c | 4 ++-- drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h | 2 +- drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- drivers/net/wireless/realtek/rtlwifi/wifi.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c index 3ab0cfe26513..19a95b69255d 100644 --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.c @@ -29,7 +29,7 @@ #include "rtl_btc.h" #include "halbt_precomp.h" -static struct rtl_btc_ops rtl_btc_operation = { +static const struct rtl_btc_ops rtl_btc_operation = { .btc_init_variables = rtl_btc_init_variables, .btc_init_hal_vars = rtl_btc_init_hal_vars, .btc_init_hw_config = rtl_btc_init_hw_config, @@ -161,7 +161,7 @@ void rtl_btc_special_packet_notify(struct rtl_priv *rtlpriv, u8 pkt_type) return exhalbtc_special_packet_notify(_bt_coexist, pkt_type); } -struct rtl_btc_ops *rtl_btc_get_ops_pointer(void) +const struct rtl_btc_ops *rtl_btc_get_ops_pointer(void) { return _btc_operation; } diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h index fff5117e1c4e..83c5bb2d6ad8 100644 --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/rtl_btc.h @@ -44,7 +44,7 @@ bool rtl_btc_is_disable_edca_turbo(struct rtl_priv *rtlpriv); bool rtl_btc_is_bt_disabled(struct rtl_priv *rtlpriv); void rtl_btc_special_packet_notify(struct rtl_priv *rtlpriv, u8 pkt_type); -struct rtl_btc_ops *rtl_btc_get_ops_pointer(void); +const struct rtl_btc_ops *rtl_btc_get_ops_pointer(void); u8 rtl_get_hwpg_bt_exist(struct rtl_priv *rtlpriv); u8 rtl_get_hwpg_bt_type(struct rtl_priv *rtlpriv); diff --git 
a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index fa93401acdab..8000894c4212 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1814,7 +1814,7 @@ static int rtl_pci_start(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_hal_ops *ops = rtlpriv->cfg->ops; - struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; + const struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops; struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw)); struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw)); struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw)); diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h index 9a916188a703..d03f0ca92530 100644 --- a/drivers/net/wireless/realtek/rtlwifi/wifi.h +++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h @@ -2475,7 +2475,7 @@ struct rtl_btc_info { }; struct bt_coexist_info { - struct rtl_btc_ops *btc_ops; + const struct rtl_btc_ops *btc_ops; struct rtl_btc_info btc_info; /* EEPROM BT info. */ u8 eeprom_bt_coexist; -- 2.10.0.rc2.1.g053435c
[PATCH 5/5] realtek: rtlwifi: drivers: Use the rtl_btc convenience macro
Convert the uses of the btcoexist. to the rtl_btc macro to save a few lines of code. Signed-off-by: Joe Perches--- drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c | 3 +-- drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c | 4 +--- drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c | 3 +-- drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c | 3 +-- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c | 6 ++ drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 3 +-- 6 files changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c index 11d97fa0e921..9fc3e79e5a43 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c @@ -2572,8 +2572,7 @@ void rtl92ee_bt_hw_init(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv); + rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv)); } void rtl92ee_suspend(struct ieee80211_hw *hw) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c index ec9bcf32f0ab..8621ea8f6644 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c @@ -1767,9 +1767,7 @@ void rtl_8723e_c2h_command_handle(struct ieee80211_hw *hw) rtl8723e_dm_bt_parse_bt_info(hw, ptmp_buf, c2h_event.cmd_len); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_periodical(rtlpriv); - + rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_periodical(rtlpriv)); break; default: break; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c index 5ac7b815648a..3b6d140bb863 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c +++ 
b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c @@ -2419,8 +2419,7 @@ void rtl8723e_bt_hw_init(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv); + rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv)); } void rtl8723e_suspend(struct ieee80211_hw *hw) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c index a79f936bb394..bddc4f56832d 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c @@ -2719,8 +2719,7 @@ void rtl8723be_bt_hw_init(struct ieee80211_hw *hw) { struct rtl_priv *rtlpriv = rtl_priv(hw); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv); + rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv)); } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c index 73350103b736..a7a537716a8e 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c @@ -1879,10 +1879,8 @@ void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw, case C2H_8812_BT_INFO: RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD, "[C2H], C2H_8812_BT_INFO!!\n"); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_btinfo_notify(rtlpriv, - tmp_buf, - c2h_cmd_len); + rtl_btc(rtlpriv, rtlpriv->cfg->ops, + btc_btinfo_notify(rtlpriv, tmp_buf, c2h_cmd_len)); break; default: break; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c index 2bc6bace069c..f76c64570d16 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c @@ -4012,8 +4012,7 @@ void rtl8821ae_bt_hw_init(struct ieee80211_hw *hw) { struct 
rtl_priv *rtlpriv = rtl_priv(hw); - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_init_hw_config(rtlpriv); + rtl_btc(rtlpriv, rtlpriv->cfg->ops, btc_init_hw_config(rtlpriv)); } void rtl8821ae_suspend(struct ieee80211_hw *hw) -- 2.10.0.rc2.1.g053435c
[PATCH iproute2 1/1] tc: fixed typo in usage text.
Signed-off-by: Roman Mashak--- tc/f_u32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/f_u32.c b/tc/f_u32.c index ff700e9..b272c2c 100644 --- a/tc/f_u32.c +++ b/tc/f_u32.c @@ -34,7 +34,7 @@ static void explain(void) "Usage: ... u32 [ match SELECTOR ... ] [ link HTID ] [ classid CLASSID ]\n" " [ action ACTION_SPEC ] [ offset OFFSET_SPEC ]\n" " [ ht HTID ] [ hashkey HASHKEY_SPEC ]\n" - " [ sample SAMPLE ] [skip-hw | skip-sw]\n" + " [ sample SAMPLE ] [skip_hw | skip_sw]\n" "or u32 divisor DIVISOR\n" "\n" "Where: SELECTOR := SAMPLE SAMPLE ...\n" -- 1.9.1
[PATCH] liquidio: stop using huge static buffer, save 4096k in .data
Only compile-tested - I don't have the hardware. >From code inspection, octeon_pci_write_core_mem() appears to be safe wrt unaligned source. In any case, u8 fbuf[] was not guaranteed to be aligned anyway. Signed-off-by: Denys VlasenkoCC: Felix Manlunas CC: Prasad Kanneganti CC: Derek Chickles CC: David Miller CC: netdev@vger.kernel.org CC: linux-ker...@vger.kernel.org --- drivers/net/ethernet/cavium/liquidio/octeon_console.c | 6 +- drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c | 4 ++-- drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c b/drivers/net/ethernet/cavium/liquidio/octeon_console.c index 53f38d0..e08f760 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c @@ -724,13 +724,11 @@ static int octeon_console_read(struct octeon_device *oct, u32 console_num, } #define FBUF_SIZE (4 * 1024 * 1024) -u8 fbuf[FBUF_SIZE]; int octeon_download_firmware(struct octeon_device *oct, const u8 *data, size_t size) { int ret = 0; - u8 *p = fbuf; u32 crc32_result; u64 load_addr; u32 image_len; @@ -805,10 +803,8 @@ int octeon_download_firmware(struct octeon_device *oct, const u8 *data, else size = FBUF_SIZE; - memcpy(p, data, size); - /* download the image */ - octeon_pci_write_core_mem(oct, load_addr, p, (u32)size); + octeon_pci_write_core_mem(oct, load_addr, data, (u32)size); data += size; rem -= (u32)size; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c index 5cd96e7..4c85ae6 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c @@ -167,10 +167,10 @@ octeon_pci_read_core_mem(struct octeon_device *oct, void octeon_pci_write_core_mem(struct octeon_device *oct, u64 coreaddr, - u8 *buf, + const u8 *buf, u32 len) { - 
__octeon_pci_rw_core_mem(oct, coreaddr, buf, len, 0); + __octeon_pci_rw_core_mem(oct, coreaddr, (u8 *)buf, len, 0); } u64 octeon_read_device_mem64(struct octeon_device *oct, u64 coreaddr) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h index bae2fdd..47a3ff5 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h @@ -66,7 +66,7 @@ octeon_pci_read_core_mem(struct octeon_device *oct, void octeon_pci_write_core_mem(struct octeon_device *oct, u64 coreaddr, - u8 *buf, + const u8 *buf, u32 len); #endif -- 2.9.2
Re: [PATCH net] sctp: ensure ep is not destroyed before doing the dump
From: Xin Long Date: Sat, 17 Jun 2017 16:10:27 +0800 > Now before dumping a sock in sctp_diag, it only holds the sock while > the ep may be already destroyed. It can cause a use-after-free panic > when accessing ep->asocs. > > This patch is to set sctp_sk(sk)->ep NULL in sctp_endpoint_destroy, > and check if this ep is already destroyed before dumping this ep. > > Suggested-by: Marcelo Ricardo Leitner > Signed-off-by: Xin Long Applied, thanks.
Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops
On 06/17/2017 11:48 PM, Lawrence Brakmo wrote: On 6/16/17, 5:07 AM, "Daniel Borkmann"wrote: On 06/15/2017 10:08 PM, Lawrence Brakmo wrote: > Two new corresponding structs (one for the kernel one for the user/BPF > program): > > /* kernel version */ > struct bpf_socket_ops_kern { > struct sock *sk; > __u32 is_req_sock:1; > __u32 op; > union { > __u32 reply; > __u32 replylong[4]; > }; > }; > > /* user version */ > struct bpf_socket_ops { > __u32 op; > union { > __u32 reply; > __u32 replylong[4]; > }; > __u32 family; > __u32 remote_ip4; > __u32 local_ip4; > __u32 remote_ip6[4]; > __u32 local_ip6[4]; > __u32 remote_port; > __u32 local_port; > }; Above and ... struct bpf_sock { __u32 bound_dev_if; __u32 family; __u32 type; __u32 protocol; }; ... would result in two BPF sock user versions. It's okayish, but given struct bpf_sock is quite generic, couldn't we merge the members from struct bpf_socket_ops into struct bpf_sock instead? Idea would be that sock_filter_is_valid_access() for cgroups would then check off < 0 || off + size > offsetofend(struct bpf_sock, protocol) to disallow new members, and your socket_ops_is_valid_access() could allow and xlate the full range. The family member is already duplicate and the others could then be accessed from these kind of BPF progs as well, plus we have a single user representation similar as with __sk_buff that multiple types will use. I am concerned that it could make usage more confusing. One type of sock program (cgroup) could only use a subset of the fields while the other type (socket_ops) could use all (or a different subset). Then what happens if there is a need to add a new field to cgroup type sock program? In addition, in the near future I will have a patch to attach socket_ops programs to cgroups. I rather leave it as it is. Okay, I'm fine with that as well. For the __sk_buff, we also have the case that some members are not available for all program types like tc_classid, so it's similar there. 
But if indeed the majority of members cannot be supported for the most part (?) then having different structs seems okay, probably easier to use, but we should try hard not to end up with 10 different uapi socket structs that apply to program types working on sockets in one way or another.
Re: rtnetlink: add IFLA_GROUP to ifla_policy
From: David Miller Date: Mon, 19 Jun 2017 14:47:44 -0400 (EDT) > From: Serhey Popovych > Date: Fri, 16 Jun 2017 15:22:24 +0300 > >> Network interface groups support added while ago, however >> there is no IFLA_GROUP attribute description in policy >> and netlink message size calculations until now. >> >> Add IFLA_GROUP attribute to the policy. >> >> Fixes: cbda10fa97d7 ("net_device: add support for network device groups") >> Signed-off-by: Serhey Popovych > > Applied and queued up for -stable, thanks. Actually, this doesn't apply cleanly to the 'net' tree, please respin. Thanks.
Re: rtnetlink: add IFLA_GROUP to ifla_policy
From: Serhey Popovych Date: Fri, 16 Jun 2017 15:22:24 +0300 > Network interface groups support added while ago, however > there is no IFLA_GROUP attribute description in policy > and netlink message size calculations until now. > > Add IFLA_GROUP attribute to the policy. > > Fixes: cbda10fa97d7 ("net_device: add support for network device groups") > Signed-off-by: Serhey Popovych Applied and queued up for -stable, thanks.
Re: [RFC PATCH net-next v2 01/15] bpf: BPF support for socket ops
On 06/17/2017 01:41 AM, Lawrence Brakmo wrote: On 6/16/17, 5:07 AM, "Daniel Borkmann"wrote: [...] I see. You are saying have one struct in common but still keep the two PROG_TYPES? That makes sense. Do we really need two different is_valid_access functions? Both types should be able to see all the fields (otherwise adding new fields becomes messy). Would probably leave the two is_valid_access() separate initially, and once people ask for it we could potentially open this up to some of the other fields that are available at that time. > Currently there are two types of ops. The first type expects the BPF > program to return a value which is then used by the caller (or a > negative value to indicate the operation is not supported). The second > type expects state changes to be done by the BPF program, for example > through a setsockopt BPF helper function, and they ignore the return > value. [...] > +/* Call BPF_SOCKET_OPS program that returns an int. If the return value > + * is < 0, then the BPF op failed (for example if the loaded BPF > + * program does not support the chosen operation or there is no BPF > + * program loaded). > + */ > +#ifdef CONFIG_BPF > +static inline int tcp_call_bpf(struct sock *sk, bool is_req_sock, int op) > +{ > + struct bpf_socket_ops_kern socket_ops; > + > + memset(_ops, 0, sizeof(socket_ops)); > + socket_ops.sk = sk; > + socket_ops.is_req_sock = is_req_sock ? 1 : 0; Is is_req_sock actually used here in this patch (apart from setting it)? Not seeing that BPF prog will access it, if it also shouldn't access it, then bool type would be better. The only reason I used a bit was in case I wanted to add more fields later on. Does it make sense or should I just use bool? Didn't know that, but I think starting out with bool seems a bit cleaner, if needed we could later still switch to bitfield. > + socket_ops.op = op; > + > + return bpf_socket_ops_call(_ops); > +} [...] 
> +/* Global BPF program for sockets */ > +static struct bpf_prog *bpf_socket_ops_prog; > +static DEFINE_RWLOCK(bpf_socket_ops_lock); > + > +int bpf_socket_ops_set_prog(int fd) > +{ > + int err = 0; > + > + write_lock(_socket_ops_lock); > + if (bpf_socket_ops_prog) { > + bpf_prog_put(bpf_socket_ops_prog); > + bpf_socket_ops_prog = NULL; > + } > + > + /* fd of zero is used as a signal to remove the current > + * bpf_socket_ops_prog. > + */ > + if (fd == 0) { Can we make the fd related semantics similar to dev_change_xdp_fd()? Do you mean remove program is fd < 0 instead of == 0? Yes, that and also the ordering of dropping the ref of the existing bpf_socket_ops_prog program with setting the new one, so you can convert bpf_socket_ops_prog to RCU more easily. > + write_unlock(_socket_ops_lock); > + return 1; > + } > + > + bpf_socket_ops_prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_OPS); > + if (IS_ERR(bpf_socket_ops_prog)) { > + bpf_prog_put(bpf_socket_ops_prog); This will crash the kernel, passing err value to bpf_prog_put(). [...]
Re: [PATCH] loopback: Force LOOPBACK_IFINDEX for registration
From: Serhey Popovych Date: Fri, 16 Jun 2017 15:10:03 +0300 > Now with commit 9c7dafb (net: Allow to create links with > given ifindex) support registration of network devices > with specific ifindex is added. > > We can force loopback network device index before call to > register_netdev() to ensure we always configure it with > LOOPBACK_IFINDEX. > > Kill BUG_ON() since system can continue without network > namespace failed in loopback init path, unless it is > init_net namespace where we panic() anyway. > > Signed-off-by: Serhey Popovych Is the BUG_ON() triggering? If so, why? It looks to me that unless there is a bug, this assignment is unnecessary.
Re: [PATCH v3 0/4] PTP support for macb driver
From: Rafal Ozieblo Date: Fri, 16 Jun 2017 12:58:18 +0100 > This patch series adds support for PTP synchronization protocol > in Cadence GEM driver based on PHC. This doesn't apply cleanly to net-next, please respin.
Re: ipv6: Do not leak throw route references
From: Serhey Popovych Date: Fri, 16 Jun 2017 14:42:17 +0300 > While commit 73ba57b (ipv6: fix backtracking for throw routes) > does good job on error propagation to the fib_rules_lookup() > in fib rules core framework that also corrects throw routes > handling, it does not solve route reference leakage problem > happened when we return -EAGAIN to the fib_rules_lookup() > and leave routing table entry referenced in arg->result. > > If rule with matched throw route isn't last matched in the > list we overwrite arg->result losing reference on throw > route stored previously forever. > > We also partially revert commit ab997ad (ipv6: fix the > incorrect return value of throw route) since we never return > routing table entry with dst.error == -EAGAIN when > CONFIG_IPV6_MULTIPLE_TABLES is on. Also there is no point > to check for RTF_REJECT flag since it is always set throw > route. > > Fixes: 73ba57b (ipv6: fix backtracking for throw routes) > Signed-off-by: Serhey Popovych This does not apply cleanly to the net tree, please respin.
Re: [PATCH NET] net/hns:bugfix of ethtool -t phy self_test
From: Lin Yun Sheng Date: Fri, 16 Jun 2017 17:24:51 +0800 > This patch fixes the phy loopback self_test failed issue. when > Marvell Phy Module is loaded, it will powerdown fiber when doing > phy loopback self test, which cause phy loopback self_test fail. > > Signed-off-by: Lin Yun Sheng Applied.
Re: [PATCH v2] arm: eBPF JIT compiler
On 06/17/2017 02:23 PM, Shubham Bansal wrote: Hi Daniel, Not all of the helpers have 4 or less byte arguments only, there are a few with 8 byte arguments, so making that general assumption wouldn't work. I guess what could be done is that helpers have a flag in struct bpf_func_proto which indicates for JITs that all args are 4 byte on 32bit so you could probably use convention similar to case2 for them. Presumably for that information to process, the JIT might need to be reworked to extract that via bpf_analyzer() that does a verifier run to re-analyze the program like in nfp JIT case. Let me try a better solution which can be used to support both 4 byte and 8 byte arguments. I hope it would work out. Are you sure this patch can pass if it only supports 4 byte arguments though? Let me list out what I have to do, so that you can tell me if I am thinking in a wrong way :- * I will add a bit flag in bpf_func_proto to represent whether different arguments in a function call are 4 bytes or 8 bytes. If lsb of bit flag is set then first argument is 8 byte, otherwise its not. I think I can handle this flag properly in build_insn() in my code. Does this sound okay? I don't understand second part of your solution, i.e. Presumably for that information to process, the JIT might need to be reworked to extract that via bpf_analyzer() that does a verifier run to re-analyze the program like in nfp JIT case. Please explain what are you suggesting and how can I extract bit flag from bpf_func_proto(). Please reply asap, as I would like to finish it over the weekend. Please. Sorry, had a travel over the weekend, so didn't read it in time. What is the issue with imitating in JIT what the interpreter is doing as a starting point? That should be generic enough to handle any case. Otherwise you'd need some sort of reverse mapping since verifier already converted BPF_CALL insns into relative helper addresses in imm part. -Shubham
Re: [PATCH net] net: 8021q: Fix one possible panic caused by BUG_ON in free_netdev
From: gfree.w...@vip.163.com Date: Fri, 16 Jun 2017 15:00:02 +0800 > From: Gao Feng > > The register_vlan_device would invoke free_netdev directly, when > register_vlan_dev failed. It would trigger the BUG_ON in free_netdev > if the dev was already registered. In this case, the netdev would be > freed in netdev_run_todo later. > > So add one condition check now. Only when dev is not registered, then > free it directly. > > The following is part of the coredump when netdev_upper_dev_link failed > in register_vlan_dev. I removed the lines which are too long. ... > Signed-off-by: Gao Feng Ok, I guess this is how we will have to fix this. Applied, thanks.
Re: [PATCH v2 1/2] ip_tunnel: fix ip tunnel lookup in collect_md mode
On Mon, Jun 19, 2017 at 6:13 AM, 严海双wrote: > > >> On 19 Jun 2017, at 1:43 PM, Pravin Shelar wrote: >> >> On Fri, Jun 16, 2017 at 8:27 PM, Haishuang Yan >> wrote: >>> In collect_md mode, if the tun dev is down, it still can call >>> ip_tunnel_rcv to receive on packets, and the rx statistics increase >>> improperly. >>> >>> Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.") >>> Cc: Pravin B Shelar >>> Signed-off-by: Haishuang Yan >>> >>> --- >>> Change since v2: >>> * Fix wrong recipient addresss >>> --- >>> net/ipv4/ip_tunnel.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c >>> index 0f1d876..a3caba1 100644 >>> --- a/net/ipv4/ip_tunnel.c >>> +++ b/net/ipv4/ip_tunnel.c >>> @@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net >>> *itn, >>>return cand; >>> >>>t = rcu_dereference(itn->collect_md_tun); >>> - if (t) >>> + if (t && (t->dev->flags & IFF_UP)) >>>return t; >>> >> It would be nice if we could increment drop count if tunnel device is not up. >> > Hi Pravin > > I think it’s not necessary, for example as gre tunnel, if ipgre_rcv fails, it > would trigger send an icmp unreachable > message: > > if (ipgre_rcv(skb, , hdr_len) == PACKET_RCVD) > return 0; > > icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); > > Since the tunnel device didn’t touch the packets, so increase drop statistics > is not necessary. > icmp err packets are not reliable on all networks. device stats are much more convenient during debugging connectivity issues.
Re: [PATCH v3 2/2] tcp: md5: add TCP_MD5SIG_EXT socket option to set a key address prefix
From: Ivan Delalande Date: Thu, 15 Jun 2017 18:07:07 -0700 > Replace first padding in the tcp_md5sig structure with a new flag field > and address prefix length so it can be specified when configuring a new > key for TCP MD5 signature. The tcpm_flags field will only be used if the > socket option is TCP_MD5SIG_EXT to avoid breaking existing programs, and > tcpm_prefixlen only when the TCP_MD5SIG_FLAG_PREFIX flag is set. > > Signed-off-by: Bob Gilligan > Signed-off-by: Eric Mowat > Signed-off-by: Ivan Delalande Applied, but I had to renumber TCP_MD5SIG_EXT to 32 since 31 is already taken by TCP_ULP in my tree. It's a shame we had to add a new sockopt number to do this, but I can't think of a better idea. Thanks.
Re: [PATCH v3 1/2] tcp: md5: add an address prefix for key lookup
From: Ivan Delalande Date: Thu, 15 Jun 2017 18:07:06 -0700 > This allows the keys used for TCP MD5 signature to be used for a whole > range of addresses, specified with a prefix length, instead of only one > address as it currently is. > > Signed-off-by: Bob Gilligan > Signed-off-by: Eric Mowat > Signed-off-by: Ivan Delalande Applied.
Re: [PATCH net-next 3/4] s390/diag: add diag26c support
From: Martin Schwidefsky Date: Mon, 19 Jun 2017 17:34:25 +0200 > We (as in the s390 guys) tend to add __packed to hardware and hypervisor > structures even if the attribute is not strictly necessary. Most of the > diagnose related structures look that way. Dunno if it is worth to change > them. It causes gcc to generate bad code on certain platforms (yes, probably not yours) and is in general something to avoid. Please do not use __packed unless absolutely necessary. > The diag26c struct needs to be aligned on a doubleword boundary, the > __aligned(8) is necessary. That's fine. > The __packed attribute is again superfluous but follows along the > lines of the other diag structures. Please remove it.
Re: [Bug 196093] New: dot1q S-VLAN frame on dot1ad configured interface is accepted
Confirmed. Works with 1.8.1-3 on Debian via unstable. > On Jun 18, 2017, at 10:35 PM, Toshiaki Makita> wrote: > > Hi, > > On 2017/06/17 0:40, Stephen Hemminger wrote: >> I suspect that VLAN offload on this Intel NIC is allowing any of the VLAN >> types. >> >> Begin forwarded message: >> >> Date: Fri, 16 Jun 2017 15:33:35 + >> From: bugzilla-dae...@bugzilla.kernel.org >> To: step...@networkplumber.org >> Subject: [Bug 196093] New: dot1q S-VLAN frame on dot1ad configured interface >> is accepted >> >> >> https://bugzilla.kernel.org/show_bug.cgi?id=196093 >> >>Bug ID: 196093 >> Summary: dot1q S-VLAN frame on dot1ad configured interface is >>accepted >> Product: Networking >> Version: 2.5 >>Kernel Version: 3.16.0 and 4.9.0 >> Hardware: Intel >>OS: Linux >> Tree: Mainline >>Status: NEW >> Severity: normal >> Priority: P1 >> Component: Other >> Assignee: step...@networkplumber.org >> Reporter: jason-kernelbugzi...@lixfeld.ca >>Regression: No >> >> Using the following configuration on an Intel 82599 port. Tested in Debian 8 >> with Kernel 3.16.0 and 4.9.0: >> >> ip link set dev eth4 up >> ip link add link eth4 eth4.100ad type vlan proto 802.1ad id 100 >> ip link add link eth4.100ad eth4.100ad.10q type vlan proto 802.1Q id 10 >> ip link set dev eth4 netns nni-ad >> ip link set dev eth4.100ad netns nni-ad >> ip link set dev eth4.100ad.10q netns nni-ad >> ip netns exec nni-ad ip link set dev eth4 up >> ip netns exec nni-ad ip link set dev eth4.100ad up >> ip netns exec nni-ad ip link set dev eth4.100ad.10q up >> ip netns exec nni-ad ip addr add 10.4.100.10/8 dev eth4.100ad.10q >> >> Ping to 10.4.100.10 while doing tcpdump on eth4 shows the frame has ether >> type >> 0x8100 (dot1q) on the S-VLAN, not 0x88a8 (dot1ad), yet the frame is still > > libpcap was not reliable in vlan protocol parsing. > https://github.com/the-tcpdump-group/libpcap/pull/346 > AFAIK libpcap 1.7.2 is required to parse it correctly. > >> accepted, and an echo reply is generated. 
>> >> The echo reply has the correct ethertype on the S-VLAN (0x88a8). My >> understanding is that if the frame received on the wire does not match the >> ether type of the configured interface, the frame should be dropped? > > Yes, it should. > > Toshiaki Makita >
Re: [PATCH nf-next] netns: add and use net_ns_barrier
On Tue, May 30, 2017 at 11:38:12AM +0200, Florian Westphal wrote: > Quoting Joe Stringer: > If a user loads nf_conntrack_ftp, sends FTP traffic through a network > namespace, destroys that namespace then unloads the FTP helper module, > then the kernel will crash. > > Events that lead to the crash: > 1. conntrack is created with ftp helper in netns x > 2. This netns is destroyed > 3. netns destruction is scheduled > 4. netns destruction wq starts, removes netns from global list > 5. ftp helper is unloaded, which resets all helpers of the conntracks > via for_each_net() > > but because netns is already gone from list the for_each_net() loop > doesn't include it, therefore all of these conntracks are unaffected. > > 6. helper module unload finishes > 7. netns wq invokes destructor for rmmod'ed helper Applied, thanks everyone.
Re: [PATCH net] netfilter: do not hold dev in ipt_CLUSTERIP
On Sat, May 20, 2017 at 05:08:06PM +0800, Xin Long wrote: > It's a terrible thing to hold dev in iptables target. When the dev is > being removed, unregister_netdevice has to wait for the dev to become > free. dmesg will keep logging the err: > > kernel:unregister_netdevice: waiting for veth0_in to become free. \ > Usage count = 1 > > until iptables rules with this target are removed manually. > > The worse thing is when deleting a netns, a virtual nic will be deleted > instead of reset to init_net in default_device_ops exit/exit_batch. As > it is earlier than to flush the iptables rules in iptable_filter_net_ops > exit, unregister_netdevice will block to wait for the nic to become free. > > As unregister_netdevice is actually waiting for iptables rules flushing > while iptables rules have to be flushed after unregister_netdevice. This > 'dead lock' will cause unregister_netdevice to block there forever. As > the netns is not available to operate at that moment, iptables rules can > not even be flushed manually either. > > The reproducer can be: > > # ip netns add test > # ip link add veth0_in type veth peer name veth0_out > # ip link set veth0_in netns test > # ip netns exec test ip link set lo up > # ip netns exec test ip link set veth0_in up > # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ > CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ > --local-node 1 --hashmode sourceip-sourceport > # ip netns del test > > This issue can be triggered by all virtual nics with ipt_CLUSTERIP. > > This patch is to fix it by not holding dev in ipt_CLUSTERIP, but only > save dev->ifindex instead of dev. When removing the mc from the dev, > it will get dev by c->ifindex through dev_get_by_index. > > Note that it doesn't save dev->name but dev->ifindex, as a dev->name > can be changed, it will confuse ipt_CLUSTERIP. Applied to nf-next. This problem has been there since day 1, and it's a large patch, so I prefer we follow nf-next path. Thanks!
Re: [PATCH net-next] net/mlx4_en: don't set CHECKSUM_COMPLETE on SCTP packets
hello Tariq, On Sun, 2017-06-18 at 14:10 +0300, Tariq Toukan wrote: > > @@ -624,12 +632,13 @@ static int check_csum(struct mlx4_cqe *cqe, struct > > sk_buff *skb, void *va, > > hdr += sizeof(struct vlan_hdr); > > } > > > > - if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) > > - get_fixed_ipv4_csum(hw_checksum, skb, hdr); > > + if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) && > > + (unlikely(get_fixed_ipv4_csum(hw_checksum, skb, hdr > > No! The lazy evaluation trick is wrong here. > This way you'll end up going almost always to the else (ipv6) for the > wrong reason. you are right! thanks for spotting this. > > + return -1; > > #if IS_ENABLED(CONFIG_IPV6) > > - else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) > > - if (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr))) > > - return -1; > > + else if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) && > > + (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr > > + return -1; > > Let's not change this, might cause future bugs, similarly to the one above. > > #endif > > return 0; > > } maybe we can avoid adding braces, remove that 'else' keyword and the nested 'if', thus saving one line, given that check_csum() returns the same set of values as get_fixed_ipv{4,6}_checksum(), with the same meaning (-1 => go with CHECKSUM_NONE, 0 => go with CHECKSUM_COMPLETE). >8 @@ -625,11 +633,10 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, } if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) - get_fixed_ipv4_csum(hw_checksum, skb, hdr); + return get_fixed_ipv4_csum(hw_checksum, skb, hdr); #if IS_ENABLED(CONFIG_IPV6) - else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) - if (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr))) - return -1; + if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) + return get_fixed_ipv6_csum(hw_checksum, skb, hdr); #endif return 0; } 8< I will test and repost a v2 with this modification, unless you have any objections. 
Thank you in advance! regards -- davide
Re: [PATCH V3 net-next 2/8] net: hns3: Add support of the HNAE3 framework
On Sat, 17 Jun 2017 18:24:25 +0100 Salil Mehtawrote: > + > +/* This struct defines the operation on the handle. > + * > + * init_ae_dev(): (mandatory) > + * Get PF configure from pci_dev and initialize PF hardware > + * uninit_ae_dev() > + * Disable PF device and release PF resource > + * register_client > + * Register client to ae_dev > + * unregister_client() > + * Unregister client from ae_dev > + * start() > + * Enable the hardware > + * stop() > + * Disable the hardware > + * get_status() > + * Get the carrier state of the back channel of the handle, 1 for ok, 0 for > + * non-ok > + * get_ksettings_an_result() > + * Get negotiation status,speed and duplex > + * update_speed_duplex_h() > + * Update hardware speed and duplex > + * get_media_type() > + * Get media type of MAC > + * adjust_link() > + * Adjust link status > + * set_loopback() > + * Set loopback > + * set_promisc_mode > + * Set promisc mode > + * set_mtu() > + * set mtu > + * get_pauseparam() > + * get tx and rx of pause frame use > + * set_pauseparam() > + * set tx and rx of pause frame use > + * set_autoneg() > + * set auto autonegotiation of pause frame use > + * get_autoneg() > + * get auto autonegotiation of pause frame use > + * get_coalesce_usecs() > + * get usecs to delay a TX interrupt after a packet is sent > + * get_rx_max_coalesced_frames() > + * get Maximum number of packets to be sent before a TX interrupt. > + * set_coalesce_usecs() > + * set usecs to delay a TX interrupt after a packet is sent > + * set_coalesce_frames() > + * set Maximum number of packets to be sent before a TX interrupt. 
> + * get_mac_addr() > + * get mac address > + * set_mac_addr() > + * set mac address > + * add_uc_addr > + * Add unicast addr to mac table > + * rm_uc_addr > + * Remove unicast addr from mac table > + * set_mc_addr() > + * Set multicast address > + * add_mc_addr > + * Add multicast address to mac table > + * rm_mc_addr > + * Remove multicast address from mac table > + * update_stats() > + * Update Old network device statistics > + * get_ethtool_stats() > + * Get ethtool network device statistics > + * get_strings() > + * Get a set of strings that describe the requested objects > + * get_sset_count() > + * Get number of strings that @get_strings will write > + * update_led_status() > + * Update the led status > + * set_led_id() > + * Set led id > + * get_regs() > + * Get regs dump > + * get_regs_len() > + * Get the len of the regs dump > + * get_rss_key_size() > + * Get rss key size > + * get_rss_indir_size() > + * Get rss indirection table size > + * get_rss() > + * Get rss table > + * set_rss() > + * Set rss table > + * get_tc_size() > + * Get tc size of handle > + * get_vector() > + * Get vector number and vector infomation > + * map_ring_to_vector() > + * Map rings to vector > + * unmap_ring_from_vector() > + * Unmap rings from vector > + * add_tunnel_udp() > + * Add tunnel information to hardware > + * del_tunnel_udp() > + * Delete tunnel information from hardware > + * reset_queue() > + * Reset queue > + * get_fw_version() > + * Get firmware version > + * get_mdix_mode() > + * Get media typr of phy > + * set_vlan_filter() > + * Set vlan filter config of Ports > + * set_vf_vlan_filter() > + * Set vlan filter config of vf > + */ > +struct hnae3_ae_ops { > + int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev); > + void (*uninit_ae_dev)(struct hnae3_ae_dev *ae_dev); > + > + int (*register_client)(struct hnae3_client *client, > +struct hnae3_ae_dev *ae_dev); > + void (*unregister_client)(struct hnae3_client *client, > + struct hnae3_ae_dev *ae_dev); > + int 
(*start)(struct hnae3_handle *handle); > + void (*stop)(struct hnae3_handle *handle); > + int (*get_status)(struct hnae3_handle *handle); > + void (*get_ksettings_an_result)(struct hnae3_handle *handle, > + u8 *auto_neg, u32 *speed, u8 *duplex); > + > + int (*update_speed_duplex_h)(struct hnae3_handle *handle); > + int (*cfg_mac_speed_dup_h)(struct hnae3_handle *handle, int speed, > +u8 duplex); > + > + void (*get_media_type)(struct hnae3_handle *handle, u8 *media_type); > + void (*adjust_link)(struct hnae3_handle *handle, int speed, int duplex); > + int (*set_loopback)(struct hnae3_handle *handle, > + enum hnae3_loop loop_mode, bool en); > + > + void (*set_promisc_mode)(struct hnae3_handle *handle, u32 en); > + int (*set_mtu)(struct hnae3_handle *handle, int new_mtu); > + > + void (*get_pauseparam)(struct hnae3_handle *handle, > +u32 *auto_neg, u32 *rx_en, u32 *tx_en); > + int (*set_pauseparam)(struct hnae3_handle *handle, > + u32 auto_neg, u32 rx_en, u32 tx_en); > + > + int
Re: [PATCH v1 1/2] dt-binding: ptp: add bindings document for dte based ptp clock
Hi David, Rob, I will address all of Rob's comments below. Since a part of the patch was applied to 'net-next', would you like me to send a new patch (based on the applied one), or a 'V2' of this patch ? Thanks Arun On 17-06-18 07:04 AM, Rob Herring wrote: > On Mon, Jun 12, 2017 at 01:26:00PM -0700, Arun Parameswaran wrote: >> Add device tree binding documentation for the Broadcom DTE >> PTP clock driver. >> >> Signed-off-by: Arun Parameswaran>> --- >> Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt | 13 + >> 1 file changed, 13 insertions(+) >> create mode 100644 Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt >> >> diff --git a/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt >> b/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt >> new file mode 100644 >> index 000..07590bc >> --- /dev/null >> +++ b/Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt >> @@ -0,0 +1,13 @@ >> +* Broadcom Digital Timing Engine(DTE) based PTP clock driver > > Bindings describe h/w, not drivers. > >> + >> +Required properties: >> +- compatible: should be "brcm,ptp-dte" > > Looks too generic. You need SoC specific compatible strings. > >> +- reg: address and length of the DTE block's NCO registers >> + >> +Example: >> + >> +ptp_dte: ptp_dte@180af650 { > > Don't use '_' in node names. > >> +compatible = "brcm,ptp-dte"; >> +reg = <0x180af650 0x10>; >> +status = "okay"; >> +}; >> -- >> 1.9.1 >>
[PATCH net-next] net: stmmac: enable TSO for IPv6
There is nothing in the IP that prevents us from enabling TSO for IPv6. Before patch: ftp fe80::2aa:bbff:fecc:1336%eth0 ftp> get /dev/zero 882512708 bytes received in 00:14 (56.11 MiB/s) After patch: ftp fe80::2aa:bbff:fecc:1336%eth0 ftp> get /dev/zero 1203326784 bytes received in 00:12 (94.52 MiB/s) Signed-off-by: Niklas Cassel --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6a1cb59728fe..fefbf817399a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2965,7 +2965,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) /* Manage oversized TCP frames for GMAC4 device */ if (skb_is_gso(skb) && priv->tso) { - if (ip_hdr(skb)->protocol == IPPROTO_TCP) + if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) return stmmac_tso_xmit(skb, dev); } @@ -4126,7 +4126,7 @@ int stmmac_dvr_probe(struct device *device, NETIF_F_RXCSUM; if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) { - ndev->hw_features |= NETIF_F_TSO; + ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; priv->tso = true; dev_info(priv->device, "TSO feature enabled\n"); } -- 2.11.0
[PATCH net-next] ibmvnic: Return from ibmvnic_resume if not in VNIC_OPEN state
If the ibmvnic driver is not in the VNIC_OPEN state, return from the ibmvnic_resume callback. If we are not in the VNIC_OPEN state, interrupts may not be initialized and directly calling the interrupt handler will cause a crash. Signed-off-by: John Allen --- diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 722daf5..0135095 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3859,6 +3859,9 @@ static int ibmvnic_resume(struct device *dev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int i; + if (adapter->state != VNIC_OPEN) + return 0; + /* kick the interrupt handlers just in case we lost an interrupt */ for (i = 0; i < adapter->req_rx_queues; i++) ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq,
[PATCH] dt-bindings: net: sms911x: Add missing optional VDD regulators
The lan911x family of devices requires 3.3 V power supplies (connected to VDD_IO, VDD_A and VREG_3.3 pins). The existing driver however obtains only VDD_IO and VDD_A regulators in an optional way so document this in bindings. Signed-off-by: Krzysztof Kozlowski --- Documentation/devicetree/bindings/net/smsc911x.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt b/Documentation/devicetree/bindings/net/smsc911x.txt index 16c3a9501f5d..acfafc8e143c 100644 --- a/Documentation/devicetree/bindings/net/smsc911x.txt +++ b/Documentation/devicetree/bindings/net/smsc911x.txt @@ -27,6 +27,7 @@ Optional properties: of the device. On many systems this is wired high so the device goes out of reset at power-on, but if it is under program control, this optional GPIO can wake up in response to it. +- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies Examples: -- 2.9.3
Re: [PATCH V2 net-next 1/8] net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC
On Wed, 14 Jun 2017 00:10:28 +0100 Salil Mehta wrote: > +hns3_nic_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 > *stats) > +{ > + struct hns3_nic_priv *priv = netdev_priv(ndev); > + int queue_num = priv->ae_handle->kinfo.num_tqps; > + u64 tx_bytes = 0; > + u64 rx_bytes = 0; > + u64 tx_pkts = 0; > + u64 rx_pkts = 0; > + int idx = 0; unnecessary initialization (idx is assigned by the for loop below) > + > + for (idx = 0; idx < queue_num; idx++) { > + tx_bytes += priv->ring_data[idx].ring->stats.tx_bytes; > + tx_pkts += priv->ring_data[idx].ring->stats.tx_pkts; > + rx_bytes += > + priv->ring_data[idx + queue_num].ring->stats.rx_bytes; > + rx_pkts += priv->ring_data[idx + queue_num].ring->stats.rx_pkts; > + } > + Since rx_bytes and other statistics are 64 bit values, you need to use something to ensure that updates to these values are atomic on 32 bit platforms. The most common way to handle this is with the u64_stats_sync mechanism which is a nop on 64 bit architectures, and uses a seqcount to do updates on 32 bit CPUs.
Re: [PATCH net-next 3/4] s390/diag: add diag26c support
Hi Dave, On Mon, 19 Jun 2017 10:47:26 -0400 (EDT) David Millerwrote: > From: Julian Wiedmann > Date: Mon, 19 Jun 2017 13:22:24 +0200 > > > +#define DIAG26C_GET_MAC0x > > +struct diag26c_mac_req { > > + u32 resp_buf_len; > > + u32 resp_version; > > + u16 op_code; > > + u16 devno; > > + u8 res[4]; > > +} __packed; > > The packed attribute is not necessary here, the structure will be > perfectly packed together because of the types used and the order of > the members. We (as in the s390 guys) tend to add __packed to hardware and hypervisor structures even if the attribute is not strictly necessary. Most of the diagnose related structures look that way. Dunno if it is worth to change them. I agree that __packed should be avoided for software defined structures. > __packed is to be used only in the last possible resort for > correctness and every effort whatsoever should be used to avoid using > it. > > > + > > +struct diag26c_mac_resp { > > + u32 version; > > + u8 mac[ETH_ALEN]; > > + u16 res; > > +} __packed __aligned(8); > > Using packed with an 8 byte alignment is even more unnecessary. > > Again, it is not needed, so please don't use it. The diag26c struct needs to be aligned on a doubleword boundary, the __aligned(8) is necessary. The __packed attribute is again superfluous but follows along the lines of the other diag structures. I do not mind the extra __packed attributes, but if you care about them we could remove them from the structures in diag.h. > > + */ > > +static inline int __diag26c(void *req, void *resp, enum diag26c_sc > > subcode) > > Do not mark functions inline in *.c files, let the compiler decide. > Here I disagree. Basically all of our functions with assembly code are static inline, it is a common pattern even in C files. Sometimes the compiler *is* stupid and won't inline a function. And on s390 function calls do not come for free. -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
Re: [PATCH 06/44] iommu/dma: don't rely on DMA_ERROR_CODE
On 16/06/17 19:10, Christoph Hellwig wrote: > DMA_ERROR_CODE is not a public API and will go away soon. dma dma-iommu > driver already implements a proper ->mapping_error method, so it's only > using the value internally. Add a new local define using the value > that arm64 which is the only current user of dma-iommu. I was angling at just open-coding 0/!dma_addr/etc. for simplicity rather than having anything #defined at all - nothing except the 4th and final hunks actually have any relevance to dma_mapping_error(), and I reckon it's plenty clear enough in context. The rest is just proactively blatting address arguments with "arbitrary definitely-invalid value", which is more paranoia than anything else (and arguably unnecessary). It's not the biggest deal, though, so either way: Reviewed-by: Robin Murphy> Signed-off-by: Christoph Hellwig > --- > drivers/iommu/dma-iommu.c | 18 ++ > 1 file changed, 10 insertions(+), 8 deletions(-) > > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > index 62618e77bedc..9403336f1fa6 100644 > --- a/drivers/iommu/dma-iommu.c > +++ b/drivers/iommu/dma-iommu.c > @@ -31,6 +31,8 @@ > #include > #include > > +#define IOMMU_MAPPING_ERROR 0 > + > struct iommu_dma_msi_page { > struct list_headlist; > dma_addr_t iova; > @@ -500,7 +502,7 @@ void iommu_dma_free(struct device *dev, struct page > **pages, size_t size, > { > __iommu_dma_unmap(iommu_get_domain_for_dev(dev), *handle, size); > __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT); > - *handle = DMA_ERROR_CODE; > + *handle = IOMMU_MAPPING_ERROR; > } > > /** > @@ -533,7 +535,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t > size, gfp_t gfp, > dma_addr_t iova; > unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; > > - *handle = DMA_ERROR_CODE; > + *handle = IOMMU_MAPPING_ERROR; > > min_size = alloc_sizes & -alloc_sizes; > if (min_size < PAGE_SIZE) { > @@ -627,11 +629,11 @@ static dma_addr_t __iommu_dma_map(struct device *dev, > 
phys_addr_t phys, > > iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); > if (!iova) > - return DMA_ERROR_CODE; > + return IOMMU_MAPPING_ERROR; > > if (iommu_map(domain, iova, phys - iova_off, size, prot)) { > iommu_dma_free_iova(cookie, iova, size); > - return DMA_ERROR_CODE; > + return IOMMU_MAPPING_ERROR; > } > return iova + iova_off; > } > @@ -671,7 +673,7 @@ static int __finalise_sg(struct device *dev, struct > scatterlist *sg, int nents, > > s->offset += s_iova_off; > s->length = s_length; > - sg_dma_address(s) = DMA_ERROR_CODE; > + sg_dma_address(s) = IOMMU_MAPPING_ERROR; > sg_dma_len(s) = 0; > > /* > @@ -714,11 +716,11 @@ static void __invalidate_sg(struct scatterlist *sg, int > nents) > int i; > > for_each_sg(sg, s, nents, i) { > - if (sg_dma_address(s) != DMA_ERROR_CODE) > + if (sg_dma_address(s) != IOMMU_MAPPING_ERROR) > s->offset += sg_dma_address(s); > if (sg_dma_len(s)) > s->length = sg_dma_len(s); > - sg_dma_address(s) = DMA_ERROR_CODE; > + sg_dma_address(s) = IOMMU_MAPPING_ERROR; > sg_dma_len(s) = 0; > } > } > @@ -836,7 +838,7 @@ void iommu_dma_unmap_resource(struct device *dev, > dma_addr_t handle, > > int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) > { > - return dma_addr == DMA_ERROR_CODE; > + return dma_addr == IOMMU_MAPPING_ERROR; > } > > static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, >
[PATCH 0/2] fix loadable module for DPAA Ethernet
The DPAA Ethernet makes use of a symbol that is not exported. Address the issue by propagating the dma_ops rather than calling arch_setup_dma_ops(). Madalin Bucur (2): fsl/fman: propagate dma_ops dpaa_eth: reuse the dma_ops provided by the FMan MAC device drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/ethernet/freescale/fman/mac.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) -- 2.1.0