Re: [PATCH 16/16] thunderbolt: Add support for networking over Thunderbolt cable
On Tue, Sep 19, 2017 at 01:21:44AM +0200, Andrew Lunn wrote: > On Mon, Sep 18, 2017 at 06:30:49PM +0300, Mika Westerberg wrote: > > From: Amir Levy> > > > ThunderboltIP is a protocol created by Apple to tunnel IP/ethernet > > traffic over a Thunderbolt cable. The protocol consists of configuration > > phase where each side sends ThunderboltIP login packets (the protocol is > > determined by UUID in the XDomain packet header) over the configuration > > channel. Once both sides get positive acknowledgment to their login > > packet, they configure high-speed DMA path accordingly. This DMA path is > > then used to transmit and receive networking traffic. > > > > This patch creates a virtual ethernet interface the host software can > > use in the same way as any other networking interface. Once the > > interface is brought up successfully network packets get tunneled over > > the Thunderbolt cable to the remote host and back. > > > > The connection is terminated by sending a ThunderboltIP logout packet > > over the configuration channel. We do this when the network interface is > > brought down by user or the driver is unloaded. > > > > Signed-off-by: Amir Levy > > Signed-off-by: Michael Jamet > > Signed-off-by: Mika Westerberg > > Reviewed-by: Yehezkel Bernat > > --- > > Documentation/admin-guide/thunderbolt.rst | 24 + > > drivers/thunderbolt/Kconfig | 12 + > > drivers/thunderbolt/Makefile |3 + > > drivers/thunderbolt/net.c | 1392 > > + > > 4 files changed, 1431 insertions(+) > > create mode 100644 drivers/thunderbolt/net.c > > Hi Mika > > Could this be renamed to driver/net/thunderbolt.c? I pondered between drivers/thunderbolt/net.c and drivers/net/thunderbolt.c and then decided to go with the former because it follows drivers/firewire/net.c and kind of makes it easier for user to enabled. But no problem moving it into drivers/net if that's what networking people prefer. 
> At minimum, it needs a MAINTAINERS entry pointing to netdev, so patches > get reviewed by netdev people. However, since the driver seems to be a > lot more netdev than thunderbolt, placing it in drivers/net could be > better. OK.
[PATCH,net-next,2/2] tun: enable napi_gro_frags() for TUN/TAP driver
Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. Signed-off-by: Petar PenkovCc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: da...@davemloft.net Cc: ppen...@stanford.edu --- drivers/net/tun.c | 135 ++-- include/uapi/linux/if_tun.h | 1 + 2 files changed, 130 insertions(+), 6 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 46cca1094c91..ebe0d7dc7de6 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -120,8 +121,15 @@ do { \ #define TUN_VNET_LE 0x8000 #define TUN_VNET_BE 0x4000 +#if IS_ENABLED(CONFIG_TUN_NAPI) +#define TUN_FEATURES_EXTRA IFF_NAPI_FRAGS +#else +#define TUN_FEATURES_EXTRA 0 +#endif + #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE) + IFF_MULTI_QUEUE | TUN_FEATURES_EXTRA) + #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -173,6 +181,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + struct mutex napi_mutex;/* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -276,6 +285,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile) netif_napi_add(tun->dev, >napi, tun_napi_poll, 
NAPI_POLL_WEIGHT); napi_enable(>napi); + mutex_init(>napi_mutex); } } @@ -291,6 +301,11 @@ static void tun_napi_del(struct tun_file *tfile) netif_napi_del(>napi); } +static bool tun_napi_frags_enabled(const struct tun_struct *tun) +{ + return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -1034,7 +1049,8 @@ static void tun_poll_controller(struct net_device *dev) * supports polling, which enables bridge devices in virt setups to * still use netconsole * If NAPI is enabled, however, we need to schedule polling for all -* queues. +* queues unless we are using napi_gro_frags(), which we call in +* process context and not in NAPI context. */ if (IS_ENABLED(CONFIG_TUN_NAPI)) { @@ -1042,6 +1058,9 @@ static void tun_poll_controller(struct net_device *dev) struct tun_file *tfile; int i; + if (tun_napi_frags_enabled(tun)) + return; + rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); @@ -1264,6 +1283,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) return mask; } +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, + size_t len, + const struct iov_iter *it) +{ + struct sk_buff *skb; + size_t linear; + int err; + int i; + + if (it->nr_segs > MAX_SKB_FRAGS + 1) + return ERR_PTR(-ENOMEM); + + local_bh_disable(); + skb = napi_get_frags(>napi); + local_bh_enable(); + if (!skb) + return ERR_PTR(-ENOMEM); + + linear = iov_iter_single_seg_count(it); + err = __skb_grow(skb, linear); + if (err) + goto free; + + skb->len = len; + skb->data_len = len - linear; + skb->truesize += skb->data_len; + + for (i = 1; i < it->nr_segs; i++) { + size_t fragsz = it->iov[i].iov_len; + unsigned long offset; + struct page *page; + void *data; + + if (fragsz == 0 || fragsz > PAGE_SIZE) { + err = -EINVAL; + goto free; + } + + local_bh_disable(); + data =
Re: [PATCH net-next 04/14] gtp: udp recv clean up
Hi Tom, [auto build test ERROR on net-next/master] url: https://github.com/0day-ci/linux/commits/Tom-Herbert/gtp-Additional-feature-support/20170919-143920 config: i386-randconfig-x016-201738 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 Note: the linux-review/Tom-Herbert/gtp-Additional-feature-support/20170919-143920 HEAD 737a09b8f9cd56706d01703d17523b0fea907f41 builds fine. It only hurts bisectibility. All errors (new ones prefixed by >>): drivers//net/gtp.c: In function 'gtp_rx': >> drivers//net/gtp.c:222:21: error: 'gtp' undeclared (first use in this >> function) gro_cells_receive(>gro_cells, skb); ^~~ drivers//net/gtp.c:222:21: note: each undeclared identifier is reported only once for each function it appears in drivers//net/gtp.c: In function 'gtp_link_setup': drivers//net/gtp.c:628:18: error: 'gtp' undeclared (first use in this function) gro_cells_init(>gro_cells, dev); ^~~ vim +/gtp +222 drivers//net/gtp.c 190 191 static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, 192 unsigned int hdrlen, unsigned int role) 193 { 194 struct pcpu_sw_netstats *stats; 195 196 if (!gtp_check_ms(skb, pctx, hdrlen, role)) { 197 netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); 198 return 1; 199 } 200 201 /* Get rid of the GTP + UDP headers. */ 202 if (iptunnel_pull_header(skb, hdrlen, skb->protocol, 203 !net_eq(sock_net(pctx->sk), dev_net(pctx->dev 204 return -1; 205 206 netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n"); 207 208 /* Now that the UDP and the GTP header have been removed, set up the 209 * new network header. This is required by the upper layer to 210 * calculate the transport header. 
211 */ 212 skb_reset_network_header(skb); 213 214 skb->dev = pctx->dev; 215 216 stats = this_cpu_ptr(pctx->dev->tstats); 217 u64_stats_update_begin(>syncp); 218 stats->rx_packets++; 219 stats->rx_bytes += skb->len; 220 u64_stats_update_end(>syncp); 221 > 222 gro_cells_receive(>gro_cells, skb); 223 224 return 0; 225 } 226 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
[PATCH,net-next,1/2] tun: enable NAPI for TUN/TAP driver
Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag CONFIG_TUN_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb():7.20s Signed-off-by: Petar PenkovCc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: da...@davemloft.net Cc: ppen...@stanford.edu --- drivers/net/Kconfig | 8 drivers/net/tun.c | 120 +++- 2 files changed, 118 insertions(+), 10 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index aba0d652095b..0176264b1e70 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -307,6 +307,14 @@ config TAP This option is selected by any driver implementing tap user space interface for a virtual interface to re-use core tap functionality. +config TUN_NAPI + bool "NAPI support on tx path for TUN/TAP driver" + default n + depends on TUN + ---help--- + This option allows the TUN/TAP driver to use NAPI to pass packets to + the kernel when receiving packets from user space via write()/send(). 
+ config TUN_VNET_CROSS_LE bool "Support for cross-endian vnet headers on little-endian kernels" default n diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..46cca1094c91 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -172,6 +172,7 @@ struct tun_file { u16 queue_index; unsigned int ifindex; }; + struct napi_struct napi; struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -229,6 +230,67 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; }; +static int tun_napi_receive(struct napi_struct *napi, int budget) +{ + struct tun_file *tfile = container_of(napi, struct tun_file, napi); + struct sk_buff_head *queue = >sk.sk_write_queue; + struct sk_buff_head process_queue; + struct sk_buff *skb; + int received = 0; + + __skb_queue_head_init(_queue); + + spin_lock(>lock); + skb_queue_splice_tail_init(queue, _queue); + spin_unlock(>lock); + + while (received < budget && (skb = __skb_dequeue(_queue))) { + napi_gro_receive(napi, skb); + ++received; + } + + if (!skb_queue_empty(_queue)) { + spin_lock(>lock); + skb_queue_splice(_queue, queue); + spin_unlock(>lock); + } + + return received; +} + +static int tun_napi_poll(struct napi_struct *napi, int budget) +{ + unsigned int received; + + received = tun_napi_receive(napi, budget); + + if (received < budget) + napi_complete_done(napi, received); + + return received; +} + +static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile) +{ + if (IS_ENABLED(CONFIG_TUN_NAPI)) { + netif_napi_add(tun->dev, >napi, tun_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(>napi); + } +} + +static void tun_napi_disable(struct tun_file *tfile) +{ + if (IS_ENABLED(CONFIG_TUN_NAPI)) + napi_disable(>napi); +} + +static void tun_napi_del(struct tun_file *tfile) +{ + if (IS_ENABLED(CONFIG_TUN_NAPI)) + netif_napi_del(>napi); +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -541,6 +603,11 @@ static void 
__tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); + if (tun && clean) { + tun_napi_disable(tfile); + tun_napi_del(tfile); + } + if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); @@ -598,6 +665,7 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); @@ -613,6 +681,7 @@ static void tun_detach_all(struct net_device *dev) synchronize_net();
[PATCH net-next 1/4] qed: Add iWARP enablement support
This patch is the last of the initial iWARP patch series. It adds the possiblity to actually detect iWARP from the device and enable it in the critical locations which basically make iWARP available. It wasn't submitted until now as iWARP hadn't been accepted into the rdma tree. Signed-off-by: Michal KalderonSigned-off-by: Ariel Elior --- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 6 ++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 10 +- drivers/net/ethernet/qlogic/qed/qed_rdma.c| 5 - drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 1 + 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index af106be..afd07ad 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -2069,6 +2069,12 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn, num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs); + if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) { + DP_NOTICE(p_hwfn, + "Current day drivers don't support RoCE & iWARP simultaneously on the same PF. Default to RoCE-only\n"); + p_hwfn->hw_info.personality = QED_PCI_ETH_ROCE; + } + switch (p_hwfn->hw_info.personality) { case QED_PCI_ETH_IWARP: /* Each QP requires one connection */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 376485d..8b99c7d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1691,12 +1691,12 @@ int qed_mcp_get_media_type(struct qed_dev *cdev, u32 *p_media_type) case FW_MB_PARAM_GET_PF_RDMA_ROCE: *p_proto = QED_PCI_ETH_ROCE; break; - case FW_MB_PARAM_GET_PF_RDMA_BOTH: - DP_NOTICE(p_hwfn, - "Current day drivers don't support RoCE & iWARP. 
Default to RoCE-only\n"); - *p_proto = QED_PCI_ETH_ROCE; - break; case FW_MB_PARAM_GET_PF_RDMA_IWARP: + *p_proto = QED_PCI_ETH_IWARP; + break; + case FW_MB_PARAM_GET_PF_RDMA_BOTH: + *p_proto = QED_PCI_ETH_RDMA; + break; default: DP_NOTICE(p_hwfn, "MFW answers GET_PF_RDMA_PROTOCOL but param is %08x\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 6fb9951..06715f7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -156,7 +156,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, return rc; p_hwfn->p_rdma_info = p_rdma_info; - p_rdma_info->proto = PROTOCOLID_ROCE; + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + p_rdma_info->proto = PROTOCOLID_IWARP; + else + p_rdma_info->proto = PROTOCOLID_ROCE; num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto, NULL); diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c index 46d0c3c..a1d33f3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c @@ -377,6 +377,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn, p_ramrod->personality = PERSONALITY_ISCSI; break; case QED_PCI_ETH_ROCE: + case QED_PCI_ETH_IWARP: p_ramrod->personality = PERSONALITY_RDMA_AND_ETH; break; default: -- 1.8.3.1
[PATCH net-next 2/4] qed: Add iWARP out of order support
iWARP requires OOO support which is already provided by the ll2 interface (until now was used only for iSCSI offload). The changes mostly include opening a ll2 dedicated connection for OOO and notifiying the FW about the handle id. Signed-off-by: Michal KalderonSigned-off-by: Ariel Elior --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 44 + drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 +++- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 7 +++-- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 9d989c9..568e985 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -41,6 +41,7 @@ #include "qed_rdma.h" #include "qed_reg_addr.h" #include "qed_sp.h" +#include "qed_ooo.h" #define QED_IWARP_ORD_DEFAULT 32 #define QED_IWARP_IRD_DEFAULT 32 @@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn *p_hwfn, u32 cid) spin_unlock_bh(_hwfn->p_rdma_info->lock); } +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_init_func_params *p_ramrod) +{ + p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) + + p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle; +} + static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid) { int rc; @@ -1876,6 +1884,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; } + if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_ooo_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate ooo connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; + } + qed_llh_remove_mac_filter(p_hwfn, p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr); return rc; @@ -1927,10 +1945,12 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt) struct qed_iwarp_info *iwarp_info; struct qed_ll2_acquire_data data; struct qed_ll2_cbs cbs; + u16 n_ooo_bufs; int rc = 0; iwarp_info = _hwfn->p_rdma_info->iwarp; iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; iwarp_info->max_mtu = params->max_mtu; @@ -1978,6 +1998,29 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) if (rc) goto err; + /* Start OOO connection */ + data.input.conn_type = QED_LL2_TYPE_OOO; + data.input.mtu = params->max_mtu; + + n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) / +iwarp_info->max_mtu; + n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE); + + data.input.rx_num_desc = n_ooo_bufs; + data.input.rx_num_ooo_buffers = n_ooo_bufs; + + data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ + data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE; + data.p_connection_handle = _info->ll2_ooo_handle; + + rc = qed_ll2_acquire_connection(p_hwfn, ); + if (rc) + goto err; + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + if (rc) + goto err; + return rc; err: qed_iwarp_ll2_stop(p_hwfn, p_ptt); @@ -2014,6 +2057,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP, qed_iwarp_async_event); + qed_ooo_setup(p_hwfn); return qed_iwarp_ll2_start(p_hwfn, params, p_ptt); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h index 148ef3c..9e2bfde 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h @@ -47,7 +47,12 @@ enum qed_iwarp_qp_state { #define QED_IWARP_LL2_SYN_TX_SIZE (128) #define QED_IWARP_LL2_SYN_RX_SIZE (256) #define QED_IWARP_MAX_SYN_PKT_SIZE (128) -#define QED_IWARP_HANDLE_INVAL (0xff) + +#define QED_IWARP_LL2_OOO_DEF_TX_SIZE (256) +#define QED_IWARP_MAX_OOO (16) +#define 
QED_IWARP_LL2_OOO_MAX_RX_SIZE (16384) + +#define QED_IWARP_HANDLE_INVAL (0xff) struct qed_iwarp_ll2_buff { void *data; @@ -67,6 +72,7 @@ struct qed_iwarp_info { u8 crc_needed; u8 tcp_flags;
[PATCH net-next 0/4] qed: iWARP fixes and enhancements
This patch series includes several fixes and enhancements related to iWARP. Patch #1 is actually the last of the initial iWARP submission. It has been delayed until now as I wanted to make sure that qedr supports iWARP prior to enabling iWARP device detection. iWARP changes in RDMA tree have been accepted and targeted at kernel 4.15, therefore, all iWARP fixes for this cycle are submitted to net-next. Signed-off by: michal.kalde...@cavium.com Signed-off-by: Ariel EliorMichal Kalderon (4): qed: Add iWARP enablement support qed: Add iWARP out of order support qed: Fix maximum number of CQs for iWARP qed: iWARP - Add check for errors on a SYN packet drivers/net/ethernet/qlogic/qed/qed_cxt.c | 6 +++ drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 52 +++ drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 - drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + drivers/net/ethernet/qlogic/qed/qed_mcp.c | 10 ++--- drivers/net/ethernet/qlogic/qed/qed_rdma.c| 24 +++ drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 1 + include/linux/qed/qed_ll2_if.h| 1 + 8 files changed, 91 insertions(+), 15 deletions(-) -- 1.8.3.1
[PATCH net-next 4/4] qed: iWARP - Add check for errors on a SYN packet
A SYN packet which arrives with errors from FW should be dropped. This required adding an additional field to the ll2 rx completion data. Signed-off-by: Michal KalderonSigned-off-by: Ariel Elior --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 568e985..8fc9c811 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1733,6 +1733,14 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams) memset(_info, 0, sizeof(cm_info)); ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + /* Check if packet was received with errors... */ + if (data->err_flags) { + DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n", + data->err_flags); + goto err; + } + if (GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) && GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index c06ad4f..250afa5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_comp_rx_data *data) { data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags); + data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags); data->length.packet_length = le16_to_cpu(p_cqe->rx_cqe_fp.packet_length); data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index dd7a3b8..89fa0bb 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data { void *cookie; dma_addr_t rx_buf_addr; u16 
parse_flags; + u16 err_flags; u16 vlan; bool b_last_packet; u8 connection_handle; -- 1.8.3.1
[PATCH] net: compat: assert the size of cmsg copied in is as expected
The actual length of cmsg fetched in during the second loop (i.e., kcmsg - kcmsg_base) could be different from what we get from the first loop (i.e., kcmlen). The main reason is that the two get_user() calls in the two loops (i.e., get_user(ucmlen, >cmsg_len) and __get_user(ucmlen, >cmsg_len)) could cause ucmlen to have different values even they fetch from the same userspace address, as user can race to change the memory content in >cmsg_len across fetches. Although in the second loop, the sanity check if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) is inplace, it only ensures that the cmsg fetched in during the second loop does not exceed the length of kcmlen, but not necessarily equal to kcmlen. But indicated by the assignment kmsg->msg_controllen = kcmlen, we should enforce that. This patch adds this additional sanity check and ensures that what is recorded in kmsg->msg_controllen is the actual cmsg length. Signed-off-by: Meng Xu--- net/compat.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 6ded6c8..2238171 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* +* check the length of messages copied in is the same as the +* what we get from the first loop +*/ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; -- 2.7.4
[PATCH] net: emac: Fix napi poll list corruption
This patch is pretty much a carbon copy of commit 3079c652141f ("caif: Fix napi poll list corruption") with "caif" replaced by "emac". The commit d75b1ade567f ("net: less interrupt masking in NAPI") breaks emac. It is now required that if the entire budget is consumed when poll returns, the napi poll_list must remain empty. However, like some other drivers emac tries to do a last-ditch check and if there is more work it will call napi_reschedule and then immediately process some of this new work. Should the entire budget be consumed while processing such new work then we will violate the new caller contract. This patch fixes this by not touching any work when we reschedule in emac. Signed-off-by: Christian Lamparter--- drivers/net/ethernet/ibm/emac/mal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 2c74baa2398a..fff09dcf9e34 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget) unsigned long flags; MAL_DBG2(mal, "poll(%d)" NL, budget); - again: + /* Process TX skbs */ list_for_each(l, >poll_list) { struct mal_commac *mc = @@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget) spin_lock_irqsave(>lock, flags); mal_disable_eob_irq(mal); spin_unlock_irqrestore(>lock, flags); - goto again; } mc->ops->poll_tx(mc->dev); } -- 2.14.1
Re: [PATCH net] MAINTAINERS: Remove Yuval Mintz from maintainers list
From:Date: Tue, 19 Sep 2017 12:54:34 +0300 > From: Ariel Elior > > Remove Yuval from maintaining the bnx2x & qed* modules as he is no longer > working for the company. Thanks Yuval for your huge contributions and > tireless efforts over the many years and various companies. > > Ariel > Signed-off-by: Ariel Elior Applied, thanks.
Re: [PATCH net-next v3 1/4] bpf: add helper bpf_perf_event_read_value for perf event array map
Hi Yonghong, [auto build test ERROR on net-next/master] url: https://github.com/0day-ci/linux/commits/Yonghong-Song/bpf-add-two-helpers-to-read-perf-event-enabled-running-time/20170919-134113 config: m68k-allyesconfig (attached as .config) compiler: m68k-linux-gcc (GCC) 4.9.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=m68k All errors (new ones prefixed by >>): kernel/bpf/arraymap.c: In function 'perf_event_fd_array_get_ptr': >> kernel/bpf/arraymap.c:495:6: error: too many arguments to function >> 'perf_event_read_local' if (perf_event_read_local(event, , NULL, NULL) == -EOPNOTSUPP) ^ In file included from kernel/bpf/arraymap.c:18:0: include/linux/perf_event.h:1290:19: note: declared here static inline int perf_event_read_local(struct perf_event *event, u64 *value) ^ vim +/perf_event_read_local +495 kernel/bpf/arraymap.c 480 481 static void *perf_event_fd_array_get_ptr(struct bpf_map *map, 482 struct file *map_file, int fd) 483 { 484 struct bpf_event_entry *ee; 485 struct perf_event *event; 486 struct file *perf_file; 487 u64 value; 488 489 perf_file = perf_event_get(fd); 490 if (IS_ERR(perf_file)) 491 return perf_file; 492 493 ee = ERR_PTR(-EOPNOTSUPP); 494 event = perf_file->private_data; > 495 if (perf_event_read_local(event, , NULL, NULL) == > -EOPNOTSUPP) 496 goto err_out; 497 498 ee = bpf_event_entry_gen(perf_file, map_file); 499 if (ee) 500 return ee; 501 ee = ERR_PTR(-ENOMEM); 502 err_out: 503 fput(perf_file); 504 return ee; 505 } 506 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: Page allocator bottleneck
On Mon, Sep 18, 2017 at 06:33:20PM +0300, Tariq Toukan wrote: > > > On 18/09/2017 10:44 AM, Aaron Lu wrote: > > On Mon, Sep 18, 2017 at 03:34:47PM +0800, Aaron Lu wrote: > > > On Sun, Sep 17, 2017 at 07:16:15PM +0300, Tariq Toukan wrote: > > > > > > > > It's nice to have the option to dynamically play with the parameter. > > > > But maybe we should also think of changing the default fraction > > > > guaranteed > > > > to the PCP, so that unaware admins of networking servers would also > > > > benefit. > > > > > > I collected some performance data with will-it-scale/page_fault1 process > > > mode on different machines with different pcp->batch sizes, starting > > > from the default 31(calculated by zone_batchsize(), 31 is the standard > > > value for any zone that has more than 1/2MiB memory), then incremented > > > by 31 upwards till 527. PCP's upper limit is 6*batch. > > > > > > An image is plotted and attached: batch_full.png(full here means the > > > number of process started equals to CPU number). > > > > To be clear: X-axis is the value of batch size(31, 62, 93, ..., 527), > > Y-axis is the value of per_process_ops, generated by will-it-scale, One correction here, Y-axis isn't per_process_ops but per_process_ops * nr_processes. Still, higher is better. > > higher is better. > > > > > > > > From the image: > > > - For EX machines, they all see throughput increase with increased batch > > >size and peaked at around batch_size=310, then fall; > > > - For EP machines, Haswell-EP and Broadwell-EP also see throughput > > >increase with increased batch size and peaked at batch_size=279, then > > >fall, batch_size=310 also delivers pretty good result. Skylake-EP is > > >quite different in that it doesn't see any obvious throughput increase > > >after batch_size=93, though the trend is still increasing, but in a > > > very > > >small way and finally peaked at batch_size=403, then fall. > > >Ivybridge EP behaves much like desktop ones. 
> > > - For Desktop machines, they do not see any obvious changes with > > >increased batch_size. > > > > > > So the default batch size(31) doesn't deliver good enough result, we > > > probbaly should change the default value. > > Thanks Aaron for sharing your experiment results. > That's a good analysis of the effect of the batch value. > I agree with your conclusion. > > From networking perspective, we should reconsider the defaults to be able to > reach the increasing NICs linerates. > Not only for pcp->batch, but also for pcp->high. I guess I didn't make it clear in my last email: when pcp->batch is changed, pcp->high is also changed. Their relationship is: pcp->high = pcp->batch * 6. Manipulating percpu_pagelist_fraction could increase pcp->high, but not pcp->batch(it has an upper limit as 96 currently). My test shows even when pcp->high being the same, changing pcp->batch could further improve will-it-scale's performance. e.g. in the below two cases, pcp->high are both set to 1860 but with different pcp->batch: will-it-scalenative_queued_spin_lock_slowpath(perf) pcp->batch=9615762348 79.95% pcp->batch=310 19291492 +22.3% 74.87% -5.1% Granted, this is the case for will-it-scale and may not apply to your case. I have a small patch that adds a batch interface for debug purpose, echo a value could set batch and high will be batch * 6. You are welcome to give it a try if you think it's worth(attached). 
Regards, Aaron >From e3c9516beb8302cb8fb2f5ab866bbe2686fda5fb Mon Sep 17 00:00:00 2001 From: Aaron LuDate: Thu, 6 Jul 2017 15:00:07 +0800 Subject: [PATCH] percpu_pagelist_batch: add a batch interface Signed-off-by: Aaron Lu --- include/linux/mmzone.h | 2 ++ kernel/sysctl.c| 9 + mm/page_alloc.c| 40 +++- 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ef6a13b7bd3e..0548d038b7cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -875,6 +875,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int percpu_pagelist_batch_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..85cc4544db1b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,6 +108,7 @@ extern unsigned int core_pipe_limit; extern int pid_max; extern int pid_max_min,
Re: [PATCH net] bpf: fix ri->map prog pointer on bpf_prog_realloc
On 09/19/2017 03:43 AM, Alexei Starovoitov wrote: On Tue, Sep 19, 2017 at 03:16:44AM +0200, Daniel Borkmann wrote: Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur, e.g. through patching other helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, and then during runtime fetch the actual pointer via aux->prog. This also works with prog clones as they share the same aux and fixup pointer to self after blinding finished. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann--- kernel/bpf/verifier.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b245..243c09f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,9 +4205,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent +* rewrites would still change the prog pointer. The only +* stable address we can use is aux, which also works with +* prog clones during blinding. +*/ good catch. extra load at runtime sucks, but I don't see better solution. + u64 addr = (unsigned long)prog->aux; + const int r4 = BPF_REG_4; struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), + BPF_LD_IMM64(r4, addr), + BPF_LDX_MEM(BPF_DW, r4, r4, + offsetof(struct bpf_prog_aux, prog)), needs to be BPF_FIELD_SIZEOF(struct bpf_prog_aux, prog) to work on 32-bit Good point, will spin a v2. Thanks!
Re: [PATCH v2] net: stmmac: dwmac-sun8i: Use reset exclusive
On Mon, Sep 18, 2017 at 08:30:43PM +0200, Corentin Labbe wrote: > The current dwmac_sun8i module cannot be rmmod/modprobe due to that > the reset controller was not released when removed. > > This patch remove ambiguity, by using of_reset_control_get_exclusive and > add the missing reset_control_put(). > > Note that we cannot use devm_reset_control_get, since the reset is not > in the device node. > > Signed-off-by: Corentin Labbe> --- > Changes since v1: > - added a note about devm_reset_control_get in commit message That comment would be better if it was in the code. > > drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c > b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c > index 57bb6dd7b401..1736d7cb0d96 100644 > --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c > +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c > @@ -854,6 +854,7 @@ static int sun8i_dwmac_unpower_internal_phy(struct > sunxi_priv_data *gmac) > > clk_disable_unprepare(gmac->ephy_clk); > reset_control_assert(gmac->rst_ephy); > + reset_control_put(gmac->rst_ephy); Putting it here is weird. What would happen if power_phy / unpower_phy is called several times? Can't we just make it symmetric and undo in remove what we do in probe? Maxime -- Maxime Ripard, Free Electrons Embedded Linux and Kernel engineering http://free-electrons.com signature.asc Description: PGP signature
Re: [PATCH RFC V1 net-next 0/6] Time based packet transmission
On Tue, Sep 19, 2017 at 04:43:02PM +0200, Miroslav Lichvar wrote: > If I understand it correctly, this also allows us to make a PTP/NTP > "one-step" clock with HW that doesn't support it directly. Cool, yeah, I hadn't thought of that, but it would work... Thanks, Richard
[PATCH net] tcp: fastopen: fix on syn-data transmit failure
From: Eric DumazetOur recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. [1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 8800b35cb1d8 81cad00d 828a4347 88009f86c080 8316eb20 0d7f 8800b35cb220 812c33c2 8800baad2440 0009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng --- net/ipv4/tcp_output.c |9 + 1 file changed, 9 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 
517d737059d18d8821b65dcdf54d9bb3448784c2..0bc9e46a53696578eb6e911f2f75e6b34c80894f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */
[PATCH net-next 3/4] qed: Fix maximum number of CQs for iWARP
The maximum number of CQs supported is bound to the number of connections supported, which differs between RoCE and iWARP. This fixes a crash that occurred in iWARP when running 1000 sessions using perftest. Signed-off-by: Michal KalderonSigned-off-by: Ariel Elior --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 4f46f28..c8c4b39 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -209,11 +209,11 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, goto free_pd_map; } - /* Allocate bitmap for cq's. The maximum number of CQs is bounded to -* twice the number of QPs. + /* Allocate bitmap for cq's. The maximum number of CQs is bound to +* the number of connections we support. (num_qps in iWARP or +* num_qps/2 in RoCE). */ - rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->cq_map, -p_rdma_info->num_qps * 2, "CQ"); + rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->cq_map, num_cons, "CQ"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate cq bitmap, rc = %d\n", rc); @@ -222,10 +222,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, /* Allocate bitmap for toggle bit for cq icids * We toggle the bit every time we create or resize cq for a given icid. -* The maximum number of CQs is bounded to twice the number of QPs. +* Size needs to equal the size of the cq bmap. */ rc = qed_rdma_bmap_alloc(p_hwfn, _rdma_info->toggle_bits, -p_rdma_info->num_qps * 2, "Toggle"); +num_cons, "Toggle"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate toogle bits, rc = %d\n", rc); -- 1.8.3.1
Re: [PATCH net] bpf: do not disable/enable BH in bpf_map_free_id()
On Tue, Sep 19, 2017 at 04:15:59PM +, Eric Dumazet wrote: > From: Eric Dumazet> > syzkaller reported following splat [1] > > Since hard irq are disabled by the caller, bpf_map_free_id() > should not try to enable/disable BH. > > Another solution would be to change htab_map_delete_elem() to > defer the free_htab_elem() call after > raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not > enough to cover other code paths. Thanks for fixing it. Acked-by: Martin KaFai Lau
Re: [PATCH] VSOCK: fix uapi/linux/vm_sockets.h incomplete types
From: Stefan HajnocziDate: Mon, 18 Sep 2017 16:21:00 +0100 > On Fri, Sep 15, 2017 at 02:14:32PM -0700, David Miller wrote: >> > diff --git a/include/uapi/linux/vm_sockets.h >> > b/include/uapi/linux/vm_sockets.h >> > index b4ed5d895699..4ae5c625ac56 100644 >> > --- a/include/uapi/linux/vm_sockets.h >> > +++ b/include/uapi/linux/vm_sockets.h >> > @@ -18,6 +18,10 @@ >> > >> > #include >> > >> > +#ifndef __KERNEL__ >> > +#include /* struct sockaddr */ >> > +#endif >> > + >> >> There is no precedence whatsoever to include sys/socket.h in _any_ UAPI >> header file provided by the kernel. > > does it for the same reason: > > include/uapi/linux/if.h:#include /* for struct > sockaddr. */ You don't need it for struct sockaddr, you need it for sa_family_t, the comment is very misleading. Please do as I have instructed and it will fix this problem. Thank you.
[PATCH net-next 4/4] net: dsa: move master ethtool code
DSA overrides the master device ethtool ops, so that it can inject stats from its dedicated switch CPU port as well. The related code is currently split in dsa.c and slave.c, but it only scopes the master net device. Move it to a new master.c DSA core file. This file will be later extented with master net device specific code. Signed-off-by: Vivien Didelot--- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 28 - net/dsa/dsa2.c | 4 +- net/dsa/dsa_priv.h | 7 ++-- net/dsa/legacy.c | 4 +- net/dsa/master.c | 120 + net/dsa/slave.c| 83 7 files changed, 129 insertions(+), 119 deletions(-) create mode 100644 net/dsa/master.c diff --git a/net/dsa/Makefile b/net/dsa/Makefile index fcce25da937c..2e7ac8bab19d 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index abadf7b49236..81c852e32821 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) -{ - struct dsa_switch *ds = cpu_dp->ds; - struct net_device *master; - struct ethtool_ops *cpu_ops; - - master = cpu_dp->netdev; - - cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); - if (!cpu_ops) - return -ENOMEM; - - cpu_dp->orig_ethtool_ops = master->ethtool_ops; - if (cpu_dp->orig_ethtool_ops) - memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); - - dsa_cpu_port_ethtool_init(cpu_ops); - master->ethtool_ops = cpu_ops; - - return 0; -} - -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) -{ - cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; - cpu_dp->orig_ethtool_ops = NULL; -} - void dsa_cpu_dsa_destroy(struct dsa_port *port) { struct device_node 
*port_dn = port->dn; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 032f8bc3e788..dcccaebde708 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -440,7 +440,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) return err; @@ -457,7 +457,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dsa_master_ethtool_restore(dst->cpu_dp->netdev); dst->cpu_dp->netdev->dsa_ptr = NULL; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9c3eeb72462d..f616b318 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -97,8 +97,6 @@ struct dsa_slave_priv { int dsa_cpu_dsa_setup(struct dsa_port *port); void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp); -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ @@ -112,6 +110,10 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); +/* master.c */ +int dsa_master_ethtool_setup(struct net_device *dev); +void dsa_master_ethtool_restore(struct net_device *dev); + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -139,7 +141,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_port *port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c 
index 163910699db7..ae505d8e4417 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -602,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst; - return dsa_cpu_port_ethtool_setup(dst->cpu_dp); + return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } static int dsa_probe(struct platform_device *pdev) @@ -667,7 +667,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) {
[PATCH net-next 3/4] net: dsa: setup master ethtool after dsa_ptr
DSA overrides the master's ethtool ops so that we can inject its CPU port's statistics. Because of that, we need to setup the ethtool ops after the master's dsa_ptr pointer has been assigned, not before. This patch setups the ethtool ops after dsa_ptr is assigned, and restores them before it gets cleared. Signed-off-by: Vivien Didelot--- net/dsa/dsa2.c | 12 +++- net/dsa/legacy.c | 10 +++--- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index bd19304f862f..032f8bc3e788 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,16 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. */ wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; + + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; + dst->applied = true; return 0; @@ -456,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -472,7 +475,6 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 91e6f7981d39..163910699db7 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -206,10 +206,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, netdev_err(master, "[%d] : can't configure CPU and DSA ports\n", index); - ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp); - if (ret) - return ret; - return 0; } @@ -606,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, 
wmb(); dev->dsa_ptr = dst; - return 0; + return dsa_cpu_port_ethtool_setup(dst->cpu_dp); } static int dsa_probe(struct platform_device *pdev) @@ -671,6 +667,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -686,8 +684,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dev_put(dst->cpu_dp->netdev); } -- 2.14.1
[PATCH net-next 2/4] net: dsa: setup master ethtool unconditionally
When a DSA switch tree is meant to be applied, it already has a CPU port. Thus remove the condition of dst->cpu_dp. Moreover, the next lines access dst->cpu_dp unconditionally. Signed-off-by: Vivien Didelot--- net/dsa/dsa2.c | 14 +- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 873af0108e24..bd19304f862f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,11 +433,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_dp) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - } + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -474,10 +472,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_dp) { - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dst->cpu_dp = NULL; - } + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; -- 2.14.1
[PATCH net-next 0/4] net: dsa: move master ethtool code
The DSA core overrides the master device's ethtool_ops structure so that it can inject statistics and such of its dedicated switch CPU port. This ethtool code is currently called on unnecessary conditions or before the master interface and its switch CPU port get wired up. This patchset fixes this. Similarly to slave.c where the DSA slave net_device is the entry point of the dsa_slave_* functions, this patchset also isolates the master's ethtool code in a new master.c file, where the DSA master net_device is the entry point of the dsa_master_* functions. This is a first step towards better control of the master device and support for multiple CPU ports. Vivien Didelot (4): net: dsa: remove copy of master ethtool_ops net: dsa: setup master ethtool unconditionally net: dsa: setup master ethtool after dsa_ptr net: dsa: move master ethtool code include/net/dsa.h | 1 - net/dsa/Makefile | 2 +- net/dsa/dsa.c | 28 - net/dsa/dsa2.c | 18 net/dsa/dsa_priv.h | 7 ++-- net/dsa/legacy.c | 10 ++--- net/dsa/master.c | 120 + net/dsa/slave.c| 80 --- 8 files changed, 136 insertions(+), 130 deletions(-) create mode 100644 net/dsa/master.c -- 2.14.1
Re: Re: [PATCH] net/packet: fix race condition between fanout_add and __unregister_prot_hook
On Tue, Sep 19, 2017 at 3:21 AM, Nixiaoming wrote: > On Fri, Sep 15, 2017 at 10:46 AM, Willem de Bruijn > > wrote: > >> > >> In case of failure we also need to unlink and free match. I > >> sent the following: > >> > >> http://patchwork.ozlabs.org/patch/813945/ > > > > + spin_lock(&po->bind_lock); > > + if (po->running && > > + match->type == type && > >match->prot_hook.type == po->prot_hook.type && > >match->prot_hook.dev == po->prot_hook.dev) { > > err = -ENOSPC; > > @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 > type_flags) > > err = 0; > > } > >} > > + spin_unlock(&po->bind_lock); > > + > > + if (err && !refcount_read(&match->sk_ref)) { > > +list_del(&match->list); > > +kfree(match); > > + } > > > > > > In the function fanout_add add spin_lock to protect po-> running and po-> fanout, > > then whether it should be in the function fanout_release also add spin_lock > protection ? po->bind_lock is held when registering and unregistering the protocol hook. fanout_release does not access po->running or prot_hook. It is called from packet_release, which does hold the bind_lock when unregistering the protocol hook.
RE: [PATCH net-next 05/12] net: dsa: b53: Use a macro to define I/O operations
> >>> +#define b53_build_op(type, op_size, val_type)\ > >>> +static inline int b53_##type##op_size(struct b53_device *dev, u8 > >page,\ > >>> + u8 reg, val_type val) > >>> \ > >>> +{ > >>> \ > >>> + int ret; > >>> \ > >>> + > >>> \ > >>> + mutex_lock(>reg_mutex); > >>> \ > >>> + ret = dev->ops->type##op_size(dev, page, reg, val); > >>> \ > >>> + mutex_unlock(>reg_mutex); > >>> \ > >>> + > >>> \ > >>> + return ret; > >>> \ > >>> } > >> > >> Why separate the 'type' and 'op_size' arguments since they > >> are always pasted together? > > > >For read/write48, the value type is u64. > > The way I read David's comment is that instead of calling the macro with > read, 48, just combine that > in a single argument: read48. I don't have a preference about that and can > respin eventually. Indeed, factoring in the type is harder because reads want 'u64 *' not 'u64'. While that could be factored, it would take more source lines and make things very obfuscated. David
[RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"
Every once in a while when my system is under a bit of stress I see some spammy messages show up in my logs that say: kevent X may have been dropped As far as I can tell these messages aren't terribly useful. The comments around the messages make me think that either workqueues used to work differently or that the original author of the code missed a subtlety related to them. The error message appears to predate the git conversion of the kernel so it's somewhat hard to tell. Specifically, workqueues should work like this: A) If a workqueue hasn't been scheduled then schedule_work() schedules it and returns true. B) If a workqueue has been scheduled (but hasn't started) then schedule_work() will do nothing and return false. C) If a workqueue has been scheduled (and has started) then schedule_work() will put it on the queue to run again and return true. Said another way: if you call schedule_work() you can guarantee that at least one full runthrough of the work will happen again. That should mean that the work will get processed and I don't see any reason to think something should be dropped. Reading the comments in usbnet_defer_kevent() made me think that B) and C) would be treated the same. That is: even if we've started the work and are 99% of the way through then schedule_work() would return false and the work wouldn't be queued again. If schedule_work() really did behave that way then, truly, some amount of work would be lost. ...but it doesn't. NOTE: if somehow these warnings are useful to mean something then perhaps we should change them to make it more obvious. If it's interesting to know when the work is backlogged then we should change the spam to say "warning: usbnet is backlogged". ALSO NOTE: If somehow some of the types of work need to be repeated if usbnet_defer_kevent() is called multiple times then that should be quite easy to accomplish without dropping any work on the floor. 
We can just keep an atomic count for that type of work and add a loop into usbnet_deferred_kevent(). Signed-off-by: Douglas Anderson--- drivers/net/usb/usbnet.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 6510e5cc1817..a3e8dbaadcf9 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -450,19 +450,17 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, } /* some work can't be done in tasklets, so we use keventd - * - * NOTE: annoying asymmetry: if it's active, schedule_work() fails, - * but tasklet_schedule() doesn't. hope the failure is rare. */ void usbnet_defer_kevent (struct usbnet *dev, int work) { set_bit (work, >flags); - if (!schedule_work (>kevent)) { - if (net_ratelimit()) - netdev_err(dev->net, "kevent %d may have been dropped\n", work); - } else { - netdev_dbg(dev->net, "kevent %d scheduled\n", work); - } + + /* If work is already started this will mark it to run again when it +* finishes; if we already had work pending and it hadn't started +* yet then that's fine too. +*/ + schedule_work (>kevent); + netdev_dbg(dev->net, "kevent %d scheduled\n", work); } EXPORT_SYMBOL_GPL(usbnet_defer_kevent); -- 2.14.1.690.gbb1197296e-goog
[PATCH V2 net 4/7] net: hns3: Fixes the initialization of MAC address in hardware
From: LipengThis patch fixes the initialization of MAC address, fetched from HNS3 firmware i.e. when it is not randomly generated, to the HNS3 hardware. Fixes: ca60906d2795 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Lipeng Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 1c3e294..4d68d6e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2705,10 +2705,11 @@ static void hns3_init_mac_addr(struct net_device *netdev) eth_hw_addr_random(netdev); dev_warn(priv->dev, "using random MAC address %pM\n", netdev->dev_addr); - /* Also copy this new MAC address into hdev */ - if (h->ae_algo->ops->set_mac_addr) - h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); } + + if (h->ae_algo->ops->set_mac_addr) + h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); + } static void hns3_nic_set_priv_ops(struct net_device *netdev) -- 2.7.4
[PATCH V2 net 6/7] net: hns3: Fixes the default VLAN-id of PF
From: LipengWhen there is no vlan id in the packets, hardware will treat the vlan id as 0 and look for the mac_vlan table. This patch set the default vlan id of PF as 0. Without this config, it will fail when look for mac_vlan table, and hardware will drop packets. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Mingguang Qu Signed-off-by: Lipeng Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 8e172af..74008ef 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3673,6 +3673,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) { #define HCLGE_VLAN_TYPE_VF_TABLE 0 #define HCLGE_VLAN_TYPE_PORT_TABLE 1 + struct hnae3_handle *handle; int ret; ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_VF_TABLE, @@ -3682,8 +3683,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_PORT_TABLE, true); + if (ret) + return ret; - return ret; + handle = >vport[0].nic; + return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false); } static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) -- 2.7.4
[PATCH V2 net 0/7] Bug fixes for the HNS3 Ethernet Driver for Hip08 SoC
This patch set presents some bug fixes for the HNS3 Ethernet driver identified during internal testing & stabilization efforts. Change Log: Patch V2: Resolved comments from Leon Romanovsky Patch V1: Initial Submit Lipeng (6): net: hns3: Fixes initialization of phy address from firmware net: hns3: Fixes the command used to unmap ring from vector net: hns3: Fixes ring-to-vector map-and-unmap command net: hns3: Fixes the initialization of MAC address in hardware net: hns3: Fixes the default VLAN-id of PF net: hns3: Fixes the premature exit of loop when matching clients Salil Mehta (1): net: hns3: Fixes the ether address copy with appropriate API drivers/net/ethernet/hisilicon/hns3/hnae3.c| 43 +- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 +++- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 20 -- .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 ++-- 4 files changed, 35 insertions(+), 43 deletions(-) -- 2.7.4
[PATCH V2 net 2/7] net: hns3: Fixes the command used to unmap ring from vector
From: LipengThis patch fixes the IMP command being used to unmap the vector from the corresponding ring. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db4e07d..e324bc6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2779,7 +2779,7 @@ static int hclge_unmap_ring_from_vector( } i = 0; hclge_cmd_setup_basic_desc(, - HCLGE_OPC_ADD_RING_TO_VECTOR, + HCLGE_OPC_DEL_RING_TO_VECTOR, false); req->int_vector_id = vector_id; } -- 2.7.4
[PATCH V2 net 3/7] net: hns3: Fixes ring-to-vector map-and-unmap command
From: LipengThis patch fixes the vector-to-ring map and unmap command and adds INT_GL(for, Gap Limiting Interrupts) and VF id to it as required by the hardware interface. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Mingguang Qu Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 ++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 91ae013..c2b613b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -238,7 +238,7 @@ struct hclge_tqp_map { u8 rsv[18]; }; -#define HCLGE_VECTOR_ELEMENTS_PER_CMD 11 +#define HCLGE_VECTOR_ELEMENTS_PER_CMD 10 enum hclge_int_type { HCLGE_INT_TX, @@ -252,8 +252,12 @@ struct hclge_ctrl_vector_chain { #define HCLGE_INT_TYPE_S 0 #define HCLGE_INT_TYPE_M 0x3 #define HCLGE_TQP_ID_S 2 -#define HCLGE_TQP_ID_M (0x3fff << HCLGE_TQP_ID_S) +#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S) +#define HCLGE_INT_GL_IDX_S 13 +#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S) __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD]; + u8 vfid; + u8 rsv; }; #define HCLGE_TC_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e324bc6..eafd9c6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2680,7 +2680,11 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + 
HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; @@ -2764,8 +2768,12 @@ static int hclge_unmap_ring_from_vector( hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; -- 2.7.4
[PATCH V2 net 5/7] net: hns3: Fixes the ether address copy with appropriate API
This patch replaces the ethernet address copy instance with more appropriate ether_addr_copy() function. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Salil Mehta--- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eafd9c6..8e172af 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1063,8 +1063,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->base_tqp_pid = 0; hdev->rss_size_max = 1; hdev->rx_buf_len = cfg.rx_buf_len; - for (i = 0; i < ETH_ALEN; i++) - hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; + ether_addr_copy(hdev->hw.mac.mac_addr, cfg.mac_addr); hdev->hw.mac.media_type = cfg.media_type; hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; -- 2.7.4
Re: [RFC net-next v2] bridge lwtunnel, VPLS & NVGRE
Hi David, What's next? Do you plan to send a v3, or should I do it? On 09/11/2017 10:02 AM, Amine Kherbouche wrote: Hi David, Do you plan to send a v3? On 21/08/2017 18:15, David Lamparter wrote: Hi all, this is an update on the earlier "[RFC net-next] VPLS support". Note I've changed the subject lines on some of the patches to better reflect what they really do (tbh the earlier subject lines were crap.) As previously, iproute2 / FRR patches are at: - https://github.com/eqvinox/vpls-iproute2 - https://github.com/opensourcerouting/frr/commits/vpls while this patchset is also available at: - https://github.com/eqvinox/vpls-linux-kernel (but please be aware that I'm amending and rebasing commits)
Re: [PATCH net-next v2 12/12] net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port
Florian Fainelli writes: > Export b53_{enable,disable}_port and use these two functions in > bcm_sf2_port_setup and bcm_sf2_port_disable. The generic functions > cannot be used without wrapping because we need to manage additional > switch integration details (PHY, Broadcom tag etc.). > > Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot
Re: [PATCH net-next 4/4] test_rhashtable: add test case for rhl_table interface
Hi Florian, [auto build test WARNING on net-next/master] url: https://github.com/0day-ci/linux/commits/Florian-Westphal/test_rhashtable-add-test-case-for-rhl-table/20170919-135550 config: x86_64-randconfig-a0-09192105 (attached as .config) compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All warnings (new ones prefixed by >>): lib/test_rhashtable.c: In function 'test_rhltable': >> lib/test_rhashtable.c:433: warning: the frame size of 2144 bytes is larger >> than 2048 bytes vim +433 lib/test_rhashtable.c 254 255 static int __init test_rhltable(unsigned int entries) 256 { 257 struct test_obj_rhl *rhl_test_objects; 258 unsigned long *obj_in_table; 259 struct rhltable rhlt; 260 unsigned int i, j, k; 261 int ret, err; 262 263 if (entries == 0) 264 entries = 1; 265 266 rhl_test_objects = vzalloc(sizeof(*rhl_test_objects) * entries); 267 if (!rhl_test_objects) 268 return -ENOMEM; 269 270 ret = -ENOMEM; 271 obj_in_table = vzalloc(BITS_TO_LONGS(entries) * sizeof(unsigned long)); 272 if (!obj_in_table) 273 goto out_free; 274 275 /* nulls_base not supported in rhlist interface */ 276 test_rht_params.nulls_base = 0; 277 err = rhltable_init(, _rht_params); 278 if (WARN_ON(err)) 279 goto out_free; 280 281 k = prandom_u32(); 282 ret = 0; 283 for (i = 0; i < entries; i++) { 284 rhl_test_objects[i].value.id = k; 285 err = rhltable_insert(, _test_objects[i].list_node, 286test_rht_params); 287 if (WARN(err, "error %d on element %d\n", err, i)) 288 break; 289 if (err == 0) 290 set_bit(i, obj_in_table); 291 } 292 293 if (err) 294 ret = err; 295 296 pr_info("test %d add/delete pairs into rhlist\n", entries); 297 for (i = 0; i < entries; i++) { 298 struct rhlist_head *h, *pos; 299 struct test_obj_rhl *obj; 300 struct test_obj_val key = { 301 .id = k, 302 }; 303 bool found; 304 305 rcu_read_lock(); 306 h = rhltable_lookup(, , test_rht_params); 307 if (WARN(!h, "key not found during iteration %d of %d", i, entries)) { 308 
rcu_read_unlock(); 309 break; 310 } 311 312 if (i) { 313 j = i - 1; 314 rhl_for_each_entry_rcu(obj, pos, h, list_node) { 315 if (WARN(pos == _test_objects[j].list_node, "old element found, should be gone")) 316 break; 317 } 318 } 319 320 cond_resched_rcu(); 321 322 found = false; 323 324 rhl_for_each_entry_rcu(obj, pos, h, list_node) { 325 if (pos == _test_objects[i].list_node) { 326 found = true; 327 break; 328 } 329 } 330 331 rcu_read_unlock(); 332 333 if (WARN(!found, "element %d not found", i)) 334 break; 335 336 err = rhltable_remove(, _test_objects[i].list_node, test_rht_params); 337 WARN(err, "rhltable_remove: err %d for iteration %d\n", err, i); 338 if (err == 0) 339 clear_bit(i, obj_in_table); 340 } 341 342 if (ret == 0 && err) 343 ret = err; 344 345 for (i = 0; i < entries; i++) { 346 WARN(test_bit(i, obj_in_table), "elem %d allegedly still present", i); 347 348 err = rhltable_insert(, _test_objects[i].list_node, 349test_rht_params); 350 if (WARN(err, "error %d on element %d\n", err, i)) 351 break; 352 if (err == 0) 353 set_bit(i, obj_in_table); 354 } 355 356 pr_info
Re: [PATCH net-next 00/14] gtp: Additional feature support
On Tue, Sep 19, 2017 at 5:43 AM, Harald Weltewrote: > Hi Tom, > > first of all, thanks a lot for your patch series. It makes me happy to > see contributions on the GTP code :) > > On Mon, Sep 18, 2017 at 05:38:50PM -0700, Tom Herbert wrote: >> - IPv6 support > > see my detailed comments in other mails. It's unfortunately only > support for the already "deprecated" IPv6-only PDP contexts, not the > more modern v4v6 type. In order to interoperate with old and new > approach, all three cases (v4, v6 and v4v6) should be supported from one > code base. > It sounds like something that can be subsequently added. Do you have a reference to the spec? >> - Configurable networking interfaces so that GTP kernel can be used >> and tested without needing GSN network emulation (i.e. no user space >> daemon needed). > > We have some pretty decent userspace utilities for configuring the GTP > interfaces and tunnels in the libgtpnl repository, but if it helps > people to have another way of configuration, I won't be against it. > AFAIK those userspace utilities don't support IPv6. Being able to configure GTP like any other encapsulation will facilitate development of IPv6 and other features. > What we have to keep in mind is that the current model of 1:1 mapping of > a "UDP socket' to a GTP netdevice is conceptually broken and needs to be > refactored soon (without breaking backwards compatibility). See related > earlier discussions with patches submitted by Andreas Schultz. > I don't think I changed the model, so this can evolve. > Summary: > > In real-world GGSNs you often want to host multiple virtual GGSNs on a > single GGSN (= UDP socket). Each virtual GGSN terminates into one > external PDN (packet data network), which can be a private corporate vpn > or any other IP network, with no routing between those networks. > Sounds like network virtualization and VNIs. 
> Naively one would assume you "simply" run another virtual GGSN > instance on another IP address, and then differentiate like that. > > However, the problem is that adding a new GGSN IP address will require > manual configuration changes at each of your roaming partners (easily > hundreds of operators!) and hence it is avoided at all cost due to the > related long schedule, requirement for interop testing with each of them, > etc. > > So what you do in reality at operators is that you operate many of those > virtual GGSNs on the same IP:Port combination (and hence UDP socket), > which means you have PDP contexts for vGGSN A which terminate on e.g. > gtp0 and PDP contexts for vGGSN B on gtp1, and so on. The decision > which gtp-device a given PDP context is a member is made by the GTP-C > instance. In the kenel we'll have to decouple net-devices from sockets. > > So whatever new configuration mechanism or architectural changes we > introduce, we need to make sure that those will accomodate the "new > model" rather than introducing further dependencies for which we will > have to maintain backwards compatibility workaronds later on. > >> - Port numbers are configurable > > I'm not sure if this is a useful feature. GTP is used only in > operator-controlled networks and only on standard ports. It's not > possible to negotiate any non-standard ports on the signaling plane > either. > Bear in mind that we're not required to do everything the GTP spec says. Adding port configuration is another one of those things that gives us flexibility and and better capability to test without needing a full blown GSN network. One feature I didn't implement was UDP source for flow entropy-- as we've seen with other encapsulation protocols this helps significantly to get good ECMP in the network. My impression is GTP designers probably didn't think in terms of getting best performance. 
But we can ;-) >> - Addition of a dst_cache in the GTP structure and other cleanup > > looks fine to me. > >> - GSO,GRO >> - Control of zero UDP checksums > > [...] > >> Additionally, this patch set also includes a couple of general support >> capabilities: >> >> - A facility that allows application specific GSO callbacks >> - Common functions to get a route fo for an IP tunnel > > This is where the "core netdev" folks will have to comment. I'm too > remote from mainline kernel development these days and will focus on > reviewing the GTP specific bits of your patch series. > Thanks. Obviously, I and many on this list have more expertise on the core networking side than GTP, so your review is quite welcome. >> For IPv6 support, the mobile subscriber needs to allow IPv6 addresses, >> and the remote enpoint can be IPv6. > > Minor correction: The mobile subscriber specifically requests a PDP Type > when establishing the PDP context via Session Management related > signaling from MS/UE to SGSN. The SGSN simply translates this to GTP > and then forwards it to the GGSN. So it's acutally not "allow" but > "specifically request". > Okay. >>
[PATCH net-next 1/4] net: dsa: remove copy of master ethtool_ops
There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot--- include/net/dsa.h | 1 - net/dsa/dsa.c | 8 net/dsa/slave.c | 19 +++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index dd44d6ce1097..8dee216a5a9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -188,7 +188,6 @@ struct dsa_port { /* * Original copy of the master netdev ethtool_ops */ - struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 03c58b0eb082..abadf7b49236 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -124,11 +124,10 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) if (!cpu_ops) return -ENOMEM; - memcpy(_dp->ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); cpu_dp->orig_ethtool_ops = master->ethtool_ops; - memcpy(cpu_ops, _dp->ethtool_ops, - sizeof(struct ethtool_ops)); + if (cpu_dp->orig_ethtool_ops) + memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); master->ethtool_ops = cpu_ops; @@ -138,6 +137,7 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) { cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } void dsa_cpu_dsa_destroy(struct dsa_port *port) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..2ff4f907d137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -574,12 +574,13 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct 
dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) { - count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data); + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); } if (ds->ops->get_ethtool_stats) @@ -591,10 +592,11 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) - count += cpu_dp->ethtool_ops.get_sset_count(dev, sset); + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) count += ds->ops->get_sset_count(ds); @@ -608,6 +610,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; @@ -619,9 +622,9 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; - if (cpu_dp->ethtool_ops.get_sset_count) { - mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_strings(dev, stringset, data); + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); } if (stringset == ETH_SS_STATS && ds->ops->get_strings) { -- 2.14.1
Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c
And 2 more events: === $ dmesg --time-format iso | grep RIP … 2017-09-19T16:52:21,623328+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 2017-09-19T16:52:40,455296+0200 RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990 2017-09-19T16:52:41,047378+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 … 2017-09-19T16:54:59,930726+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 2017-09-19T16:55:07,985767+0200 RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990 2017-09-19T16:55:41,911527+0200 RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 … === On pondělí 18. září 2017 23:40:08 CEST Yuchung Cheng wrote: > On Mon, Sep 18, 2017 at 1:46 PM, Oleksandr Natalenko > >wrote: > > Actually, same warning was just triggered with RACK enabled. But main > > warning was not triggered in this case. > > Thanks. > > I assume this kernel does not have the patch that Neal proposed in his > first reply? > > The main warning needs to be triggered by another peculiar SACK that > kicks the sender into recovery again (after undo). Please let it run > longer if possible to see if we can get both. But the new data does > indicate the we can (validly) be in CA_Open with retrans_out > 0. 
> > > === > > Sep 18 22:44:32 defiant kernel: [ cut here ] > > Sep 18 22:44:32 defiant kernel: WARNING: CPU: 1 PID: 702 at net/ipv4/ > > tcp_input.c:2392 tcp_undo_cwnd_reduction+0xbd/0xd0 > > Sep 18 22:44:32 defiant kernel: Modules linked in: netconsole ctr ccm > > cls_bpf sch_htb act_mirred cls_u32 sch_ingress sit tunnel4 ip_tunnel > > 8021q mrp nf_conntrack_ipv6 nf_defrag_ipv6 nft_ct nft_set_bitmap > > nft_set_hash nft_set_rbtree nf_tables_inet nf_tables_ipv6 nft_masq_ipv4 > > nf_nat_masquerade_ipv4 nft_masq nft_nat nft_counter nft_meta > > nft_chain_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat > > nf_conntrack libcrc32c crc32c_generic nf_tables_ipv4 nf_tables tun nct6775 > > nfnetlink hwmon_vid nls_iso8859_1 nls_cp437 vfat fat ext4 > > snd_hda_codec_hdmi mbcache jbd2 snd_hda_codec_realtek > > snd_hda_codec_generic f2fs arc4 fscrypto intel_rapl iTCO_wdt ath9k > > iTCO_vendor_support intel_powerclamp ath9k_common ath9k_hw coretemp > > kvm_intel ath mac80211 kvm irqbypass intel_cstate cfg80211 pcspkr > > snd_hda_intel snd_hda_codec r8169 > > Sep 18 22:44:32 defiant kernel: joydev evdev mii snd_hda_core mousedev > > mei_txe input_leds i2c_i801 mac_hid i915 lpc_ich mei shpchp snd_hwdep > > snd_intel_sst_acpi snd_intel_sst_core snd_soc_rt5670 > > snd_soc_sst_atom_hifi2_platform battery snd_soc_sst_match snd_soc_rl6231 > > drm_kms_helper hci_uart ov5693(C) ov2722(C) lm3554(C) btbcm btqca > > v4l2_common snd_soc_core btintel snd_compress videodev snd_pcm_dmaengine > > snd_pcm video bluetooth snd_timer drm media tpm_tis snd i2c_hid soundcore > > tpm_tis_core rfkill_gpio ac97_bus soc_button_array ecdh_generic rfkill > > crc16 tpm 8250_dw intel_gtt syscopyarea sysfillrect acpi_pad sysimgblt > > intel_int0002_vgpio fb_sys_fops pinctrl_cherryview i2c_algo_bit button > > sch_fq_codel tcp_bbr ifb ip_tables x_tables btrfs xor raid6_pq > > algif_skcipher af_alg hid_logitech_hidpp hid_logitech_dj usbhid hid uas > > Sep 18 22:44:32 defiant kernel: usb_storage 
dm_crypt dm_mod dax raid10 > > md_mod sd_mod crct10dif_pclmul crc32_pclmul crc32c_intel > > ghash_clmulni_intel pcbc ahci aesni_intel xhci_pci libahci aes_x86_64 > > crypto_simd glue_helper xhci_hcd cryptd libata usbcore scsi_mod > > usb_common serio sdhci_acpi sdhci led_class mmc_core > > Sep 18 22:44:32 defiant kernel: CPU: 1 PID: 702 Comm: irq/123-enp3s0 > > Tainted: GWC 4.13.0-pf4 #1 > > Sep 18 22:44:32 defiant kernel: Hardware name: To Be Filled By O.E.M. To > > Be > > Filled By O.E.M./J3710-ITX, BIOS P1.30 03/30/2016 > > Sep 18 22:44:32 defiant kernel: task: 88923a738000 task.stack: > > 95800150 > > Sep 18 22:44:32 defiant kernel: RIP: > > 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > > Sep 18 22:44:32 defiant kernel: RSP: 0018:88927fc83a48 EFLAGS: > > 00010202 > > Sep 18 22:44:32 defiant kernel: RAX: 0001 RBX: > > 8892412d9800 > > RCX: 88927fc83b0c > > Sep 18 22:44:32 defiant kernel: RDX: 7fff RSI: > > 0001 > > RDI: 8892412d9800 > > Sep 18 22:44:32 defiant kernel: RBP: 88927fc83a50 R08: > > > > R09: 18dfb063 > > Sep 18 22:44:32 defiant kernel: R10: 18dfd223 R11: > > 18dfb063 > > R12: 5320 > > Sep 18 22:44:32 defiant kernel: R13: 88927fc83b10 R14: > > 0001 > > R15: 88927fc83b0c > > Sep 18 22:44:32 defiant kernel: FS: () > > GS:88927fc8() knlGS: > > Sep 18 22:44:32 defiant kernel: CS: 0010 DS: ES: CR0: > > 80050033 > > Sep 18 22:44:32 defiant kernel: CR2: 7f1cd1a43620 CR3: > > 000114a09000 > > CR4: 001006e0 > > Sep 18 22:44:32 defiant kernel: Call Trace: > > Sep 18 22:44:32 defiant kernel: > > Sep 18 22:44:32 defiant kernel:
Re: [PATCH net-next 03/14] gtp: Call common functions to get tunnel routes and add dst_cache
On Mon, Sep 18, 2017 at 9:17 PM, David Miller wrote: > From: Tom Herbert > Date: Mon, 18 Sep 2017 17:38:53 -0700 > >> Call ip_tunnel_get_route and dst_cache to pdp context which should >> improve performance by obviating the need to perform a route lookup >> on every packet. >> >> Signed-off-by: Tom Herbert > > Not caused by your changes, but something to think about: > >> -static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, >> -const struct sock *sk, >> -__be32 daddr) >> -{ >> - memset(fl4, 0, sizeof(*fl4)); >> - fl4->flowi4_oif = sk->sk_bound_dev_if; >> - fl4->daddr = daddr; >> - fl4->saddr = inet_sk(sk)->inet_saddr; >> - fl4->flowi4_tos = RT_CONN_FLAGS(sk); >> - fl4->flowi4_proto = sk->sk_protocol; >> - >> - return ip_route_output_key(sock_net(sk), fl4); >> -} > > This and the new dst caching code ignores any source address selection > done by ip_route_output_key() or the new tunnel route lookup helpers. > > Either source address selection should be respected, or if saddr will > never be modified by a route lookup for some specific reason here, > that should be documented. Yes, I noticed that. In this case the source address is intended to be taken bound on the socket which would imply we aren't interested in source address selection. Tom
Re: [PATCH net-next 1/3] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE
On 09/19/2017 05:08 PM, Craig Gallek wrote: On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitovwrote: On 9/18/17 12:30 PM, Craig Gallek wrote: [...] + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid +* prefix itself, do not remove it. Reset the root of the trim +* path to its descendant on our path. +*/ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = >child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(>lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it +* as intermediate and we are done. +*/ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from +* zero or more intermediate nodes to our leaf node for deletion. +*/ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(>lock { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? + >child[0] : + >child[1]; + kfree_rcu(node, rcu); can it be that some of the nodes this loop walks have both child[0] and [1] ? No, the loop above will push trim down the walk every time it encounters a node with two children. The only other trim assignment is the initial trim = >root. But the only time we would skip the assignment in the loop is if the node being removed is the root. If the root had multiple children and is being removed, it would be handled by the case that turns the node into an intermediate node rather than walking the trim path freeing things. Looks good to me. 
We should probably still merge nodes once we turn a real node into an im which just has a single child attached to it; parent can be im or real node. Thus, we don't need to traverse this extra one on lookup. Acked-by: Daniel Borkmann
Re: [PATCH net-next 3/3] bpf: Test deletion in BPF_MAP_TYPE_LPM_TRIE
On 09/18/2017 09:30 PM, Craig Gallek wrote: From: Craig Gallek Extend the 'random' operation tests to include a delete operation (delete half of the nodes from both lpm implementations and ensure that lookups are still equivalent). Also, add a simple IPv4 test which verifies lookup behavior as nodes are deleted from the tree. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann
Re: Re: [PATCH] net/packet: fix race condition between fanout_add and __unregister_prot_hook
On Tue, Sep 19, 2017 at 12:09 PM, Willem de Bruijn wrote: > On Tue, Sep 19, 2017 at 3:21 AM, Nixiaoming wrote: >> On Fri, Sep 15, 2017 at 10:46 AM, Willem de Bruijn >> >> wrote: >> >>> >> >>> In case of failure we also need to unlink and free match. I >> >>> sent the following: >> >>> >> >>> http://patchwork.ozlabs.org/patch/813945/ >> >> >> >> + spin_lock(>bind_lock); >> >> + if (po->running && >> >> + match->type == type && >> >>match->prot_hook.type == po->prot_hook.type && >> >>match->prot_hook.dev == po->prot_hook.dev) { >> >> err = -ENOSPC; >> >> @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 >> type_flags) >> >> err = 0; >> >> } >> >>} >> >> + spin_unlock(>bind_lock); >> >> + >> >> + if (err && !refcount_read(>sk_ref)) { >> >> +list_del(>list); >> >> +kfree(match); >> >> + } >> >> >> >> >> >> In the function fanout_add add spin_lock to protect po-> running and po-> >> fanout, >> >> then whether it should be in the function fanout_release also add spin_lock >> protection ? > > po->bind_lock is held when registering and unregistering the > protocol hook. fanout_release does access po->running or > prot_hook. whoops. does *not* access.
Re: [PATCH net-next 2/3] bpf: Add uniqueness invariant to trivial lpm test implementation
On 09/18/2017 09:30 PM, Craig Gallek wrote: From: Craig Gallek The 'trivial' lpm implementation in this test allows equivalent nodes to be added (that is, nodes consisting of the same prefix and prefix length). For lookup operations, this is fine because insertion happens at the head of the (singly linked) list and the first, best match is returned. In order to support deletion, the tlpm data structure must first enforce uniqueness. This change modifies the insertion algorithm to search for equivalent nodes and remove them. Note: the BPF_MAP_TYPE_LPM_TRIE already has a uniqueness invariant that is implemented as node replacement. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann
[RFC PATCH 3/3] usbnet: Fix memory leak when rx_submit() fails
If rx_submit() returns an error code then nobody calls usb_free_urb(). That means it's leaked. NOTE: This problem was found solely by code inspection and not due to any failing test cases. Signed-off-by: Douglas Anderson--- drivers/net/usb/usbnet.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index e72547d8d0e6..4c067aaeea5a 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1182,9 +1182,12 @@ usbnet_deferred_kevent (struct work_struct *work) usb_free_urb(urb); goto fail_lowmem; } - if (rx_submit (dev, urb, GFP_KERNEL) == - -ENOLINK) - resched = 0; + status = rx_submit (dev, urb, GFP_KERNEL); + if (status) { + usb_free_urb(urb); + if (status == -ENOLINK) + resched = 0; + } usb_autopm_put_interface(dev->intf); fail_lowmem: if (resched) -- 2.14.1.690.gbb1197296e-goog
[PATCH net] bpf: do not disable/enable BH in bpf_map_free_id()
From: Eric Dumazetsyzkaller reported following splat [1] Since hard irq are disabled by the caller, bpf_map_free_id() should not try to enable/disable BH. Another solution would be to change htab_map_delete_elem() to defer the free_htab_elem() call after raw_spin_unlock_irqrestore(>lock, flags), but this might be not enough to cover other code paths. [1] WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip +0x1e/0x160 kernel/softirq.c:161 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161 RSP: 0018:8801cdcd7748 EFLAGS: 00010046 RAX: 0082 RBX: 0201 RCX: RDX: 10b5933c RSI: 0201 RDI: 85ac99e0 RBP: 8801cdcd7758 R08: 85b87158 R09: 110039b9aec6 R10: 8801c99f24c0 R11: 0002 R12: 817b0b47 R13: dc00 R14: 8801cdcd77e8 R15: 0001 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline] _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207 spin_unlock_bh include/linux/spinlock.h:361 [inline] bpf_map_free_id kernel/bpf/syscall.c:197 [inline] __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063 map_delete_elem 
kernel/bpf/syscall.c:633 [inline] SYSC_bpf kernel/bpf/syscall.c:1479 [inline] SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau --- kernel/bpf/syscall.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d434dc2e052a2a9fb0aea967fcf4417..25d074920a009ff682d97bf88e68f466c79bd564 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(_idr_lock); + spin_lock_irqsave(_idr_lock, flags); else __acquire(_idr_lock); idr_remove(_idr, map->id); if (do_idr_lock) - spin_unlock_bh(_idr_lock); + spin_unlock_irqrestore(_idr_lock, flags); else __release(_idr_lock); }
[RFC PATCH 2/3] usbnet: Avoid potential races in usbnet_deferred_kevent()
In general when you've got a flag communicating that "something needs to be done" you want to clear that flag _before_ doing the task. If you clear the flag _after_ doing the task you end up with the risk that this will happen: 1. Requester sets flag saying task A needs to be done. 2. Worker comes and starts doing task A. 3. Worker finishes task A but hasn't yet cleared the flag. 4. Requester wants to set flag saying task A needs to be done again. 5. Worker clears the flag without doing anything. Let's make the usbnet codebase consistently clear the flag _before_ it does the requested work. That way if there's another request to do the work while the work is already in progress it won't be lost. NOTES: - No known bugs are fixed by this; it's just found by code inspection. - This changes the semantics in some of the error conditions. -> If we fail to clear the "tx halt" or "rx halt" we still clear the flag and thus won't retry the clear next time we happen to be in the work function. Had the old code really wanted to retry these events it should have re-scheduled the worker anyway. -> If we fail to allocate memory in usb_alloc_urb() we will still clear the EVENT_RX_MEMORY flag. This makes it consistent with how we would deal with other failures, including failure to allocate a memory chunk in rx_submit(). It can also be noted that usb_alloc_urb() in this case is allocating much less than 4K worth of data and probably never fails. 
Signed-off-by: Douglas Anderson--- drivers/net/usb/usbnet.c | 50 +--- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index a3e8dbaadcf9..e72547d8d0e6 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1103,8 +1103,6 @@ static void __handle_link_change(struct usbnet *dev) /* hard_mtu or rx_urb_size may change during link change */ usbnet_update_max_qlen(dev); - - clear_bit(EVENT_LINK_CHANGE, >flags); } static void usbnet_set_rx_mode(struct net_device *net) @@ -1118,8 +1116,6 @@ static void __handle_set_rx_mode(struct usbnet *dev) { if (dev->driver_info->set_rx_mode) (dev->driver_info->set_rx_mode)(dev); - - clear_bit(EVENT_SET_RX_MODE, >flags); } /* work that cannot be done in interrupt context uses keventd. @@ -1135,7 +1131,7 @@ usbnet_deferred_kevent (struct work_struct *work) int status; /* usb_clear_halt() needs a thread context */ - if (test_bit (EVENT_TX_HALT, >flags)) { + if (test_and_clear_bit (EVENT_TX_HALT, >flags)) { unlink_urbs (dev, >txq); status = usb_autopm_get_interface(dev->intf); if (status < 0) @@ -1150,12 +1146,11 @@ usbnet_deferred_kevent (struct work_struct *work) netdev_err(dev->net, "can't clear tx halt, status %d\n", status); } else { - clear_bit (EVENT_TX_HALT, >flags); if (status != -ESHUTDOWN) netif_wake_queue (dev->net); } } - if (test_bit (EVENT_RX_HALT, >flags)) { + if (test_and_clear_bit (EVENT_RX_HALT, >flags)) { unlink_urbs (dev, >rxq); status = usb_autopm_get_interface(dev->intf); if (status < 0) @@ -1170,41 +1165,39 @@ usbnet_deferred_kevent (struct work_struct *work) netdev_err(dev->net, "can't clear rx halt, status %d\n", status); } else { - clear_bit (EVENT_RX_HALT, >flags); tasklet_schedule (>bh); } } /* tasklet could resubmit itself forever if memory is tight */ - if (test_bit (EVENT_RX_MEMORY, >flags)) { + if (test_and_clear_bit (EVENT_RX_MEMORY, >flags)) { struct urb *urb = NULL; int resched = 1; - if (netif_running (dev->net)) + if 
(netif_running (dev->net)) { urb = usb_alloc_urb (0, GFP_KERNEL); - else - clear_bit (EVENT_RX_MEMORY, >flags); - if (urb != NULL) { - clear_bit (EVENT_RX_MEMORY, >flags); - status = usb_autopm_get_interface(dev->intf); - if (status < 0) { - usb_free_urb(urb); - goto fail_lowmem; - } - if (rx_submit (dev, urb, GFP_KERNEL) == -ENOLINK) - resched = 0; - usb_autopm_put_interface(dev->intf); + if (urb != NULL) { +
[PATCH V2 net 7/7] net: hns3: Fixes the premature exit of loop when matching clients
From: LipengWhen register/unregister ae_dev, ae_dev should match all client in the client_list. Enet and roce can co-exists together so we should continue checking for enet and roce presence together. So break should not be there. Above caused problems in loading and unloading of modules. Fixes: 38eddd126772 ("net: hns3: Add support of the HNAE3 framework") Signed-off-by: Lipeng Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 43 ++--- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 59efbd6..5bcb223 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -37,20 +37,15 @@ static bool hnae3_client_match(enum hnae3_client_type client_type, } static int hnae3_match_n_instantiate(struct hnae3_client *client, -struct hnae3_ae_dev *ae_dev, -bool is_reg, bool *matched) +struct hnae3_ae_dev *ae_dev, bool is_reg) { int ret; - *matched = false; - /* check if this client matches the type of ae_dev */ if (!(hnae3_client_match(client->type, ae_dev->dev_type) && hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))) { return 0; } - /* there is a match of client and dev */ - *matched = true; /* now, (un-)instantiate client by calling lower layer */ if (is_reg) { @@ -69,7 +64,6 @@ int hnae3_register_client(struct hnae3_client *client) { struct hnae3_client *client_tmp; struct hnae3_ae_dev *ae_dev; - bool matched; int ret = 0; mutex_lock(_common_lock); @@ -86,7 +80,7 @@ int hnae3_register_client(struct hnae3_client *client) /* if the client could not be initialized on current port, for * any error reasons, move on to next available port */ - ret = hnae3_match_n_instantiate(client, ae_dev, true, ); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(_dev->pdev->dev, "match and instantiation failed for port\n"); @@ -102,12 +96,11 @@ EXPORT_SYMBOL(hnae3_register_client); void 
hnae3_unregister_client(struct hnae3_client *client) { struct hnae3_ae_dev *ae_dev; - bool matched; mutex_lock(_common_lock); /* un-initialize the client on every matched port */ list_for_each_entry(ae_dev, _ae_dev_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, ); + hnae3_match_n_instantiate(client, ae_dev, false); } list_del(>node); @@ -124,7 +117,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(_common_lock); @@ -151,13 +143,10 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) * initialize the figure out client instance */ list_for_each_entry(client, _client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - ); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } } @@ -175,7 +164,6 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; mutex_lock(_common_lock); /* Check if there are matched ae_dev */ @@ -187,12 +175,8 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) /* check the client list for the match with this ae_dev type and * un-initialize the figure out client instance */ - list_for_each_entry(client, _client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - ); - if (matched) - break; - } + list_for_each_entry(client, _client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev);
[PATCH V2 net 1/7] net: hns3: Fixes initialization of phy address from firmware
From: LipengDefault phy address of every port is 0. Therefore, phy address for each port need to be fetched from firmware and device initialized with fetched non-default phy address. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bb45365..db4e07d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1066,6 +1066,7 @@ static int hclge_configure(struct hclge_dev *hdev) for (i = 0; i < ETH_ALEN; i++) hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; hdev->hw.mac.media_type = cfg.media_type; + hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; hdev->tm_info.num_pg = 1; hdev->tm_info.num_tc = cfg.tc_num; -- 2.7.4
Re: [PATCH net v2] l2tp: fix race condition in l2tp_tunnel_delete
On Tue, Sep 19, 2017 at 03:40:40PM +0200, Sabrina Dubroca wrote: > If we try to delete the same tunnel twice, the first delete operation > does a lookup (l2tp_tunnel_get), finds the tunnel, calls > l2tp_tunnel_delete, which queues it for deletion by > l2tp_tunnel_del_work. > > The second delete operation also finds the tunnel and calls > l2tp_tunnel_delete. If the workqueue has already fired and started > running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the > same tunnel a second time, and try to free the socket again. > > Add a dead flag to prevent firing the workqueue twice. Then we can > remove the check of queue_work's result that was meant to prevent that > race but doesn't. > > Also check the flag in the tunnel lookup functions, to avoid returning a > tunnel that is already scheduled for destruction. > > Reproducer: > > ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 > remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 > ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 > peer_session_id 2000 > ip link set l2tp1 up > ip l2tp del tunnel tunnel_id 3000 > ip l2tp del tunnel tunnel_id 3000 > > Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") > Reported-by: Jianlin Shi> Signed-off-by: Sabrina Dubroca > --- > v2: as Tom Parkin explained, we can't remove the tunnel from the > per-net list from netlink. 
v2 uses only a dead flag, and adds > corresponding checks during lookups > > net/l2tp/l2tp_core.c | 18 +- > net/l2tp/l2tp_core.h | 5 - > 2 files changed, 13 insertions(+), 10 deletions(-) > > diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c > index ee485df73ccd..3891f0260f2b 100644 > --- a/net/l2tp/l2tp_core.c > +++ b/net/l2tp/l2tp_core.c > @@ -203,7 +203,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net > *net, u32 tunnel_id) > > rcu_read_lock_bh(); > list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) { > - if (tunnel->tunnel_id == tunnel_id) { > + if (tunnel->tunnel_id == tunnel_id && > + !test_bit(0, >dead)) { > l2tp_tunnel_inc_refcount(tunnel); > rcu_read_unlock_bh(); > > @@ -390,7 +391,8 @@ struct l2tp_tunnel *l2tp_tunnel_find(const struct net > *net, u32 tunnel_id) > > rcu_read_lock_bh(); > list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) { > - if (tunnel->tunnel_id == tunnel_id) { > + if (tunnel->tunnel_id == tunnel_id && > + !test_bit(0, >dead)) { > rcu_read_unlock_bh(); > return tunnel; > } > @@ -409,7 +411,7 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net > *net, int nth) > > rcu_read_lock_bh(); > list_for_each_entry_rcu(tunnel, >l2tp_tunnel_list, list) { > - if (++count > nth) { > + if (++count > nth && !test_bit(0, >dead)) { > rcu_read_unlock_bh(); > return tunnel; > } > I don't get why you're checking the dead flag in l2tp_tunnel_{get,find}*(). Since it can be set concurrently right after test_bit(), it doesn't protect the caller from getting a tunnel that is being removed by l2tp_tunnel_delete(). Or have I missed something?
Re: [RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"
On Tue, Sep 19, 2017 at 9:15 AM, Douglas Andersonwrote: > Every once in a while when my system is under a bit of stress I see > some spammy messages show up in my logs that say: > > kevent X may have been dropped > > As far as I can tell these messages aren't terribly useful. The > comments around the messages make me think that either workqueues used > to work differently or that the original author of the code missed a > sublety related to them. The error message appears to predate the git > conversion of the kernel so it's somewhat hard to tell. > > Specifically, workqueues should work like this: > > A) If a workqueue hasn't been scheduled then schedule_work() schedules >it and returns true. > > B) If a workqueue has been scheduled (but hasn't started) then >schedule_work() will do nothing and return false. > > C) If a workqueue has been scheduled (and has started) then >schedule_work() will put it on the queue to run again and return >true. > > Said another way: if you call schedule_work() you can guarantee that > at least one full runthrough of the work will happen again. That > should mean that the work will get processed and I don't see any > reason to think something should be dropped. > > Reading the comments in in usbnet_defer_kevent() made me think that B) > and C) would be treated the same. That is: even if we've started the > work and are 99% of the way through then schedule_work() would return > false and the work wouldn't be queued again. If schedule_work() > really did behave that way then, truly, some amount of work would be > lost. ...but it doesn't. > > NOTE: if somehow these warnings are useful to mean something then > perhaps we should change them to make it more obvious. If it's > interesting to know when the work is backlogged then we should change > the spam to say "warning: usbnet is backlogged". 
> > ALSO NOTE: If somehow some of the types of work need to be repeated if > usbnet_defer_kevent() is called multiple times then that should be > quite easy to accomplish without dropping any work on the floor. We > can just keep an atomic count for that type of work and add a loop > into usbnet_deferred_kevent(). > > Signed-off-by: Douglas Anderson Reviewed-by: Guenter Roeck > --- > > drivers/net/usb/usbnet.c | 16 +++- > 1 file changed, 7 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c > index 6510e5cc1817..a3e8dbaadcf9 100644 > --- a/drivers/net/usb/usbnet.c > +++ b/drivers/net/usb/usbnet.c > @@ -450,19 +450,17 @@ static enum skb_state defer_bh(struct usbnet *dev, > struct sk_buff *skb, > } > > /* some work can't be done in tasklets, so we use keventd > - * > - * NOTE: annoying asymmetry: if it's active, schedule_work() fails, > - * but tasklet_schedule() doesn't. hope the failure is rare. > */ > void usbnet_defer_kevent (struct usbnet *dev, int work) > { > set_bit (work, >flags); > - if (!schedule_work (>kevent)) { > - if (net_ratelimit()) > - netdev_err(dev->net, "kevent %d may have been > dropped\n", work); > - } else { > - netdev_dbg(dev->net, "kevent %d scheduled\n", work); > - } > + > + /* If work is already started this will mark it to run again when it > +* finishes; if we already had work pending and it hadn't started > +* yet then that's fine too. > +*/ > + schedule_work (>kevent); > + netdev_dbg(dev->net, "kevent %d scheduled\n", work); > } > EXPORT_SYMBOL_GPL(usbnet_defer_kevent); > > -- > 2.14.1.690.gbb1197296e-goog >
Re: [PATCH net-next v2 08/12] net: dsa: b53: Move EEE functions to b53
Florian Fainelli writes: > Move the bcm_sf2 EEE-related functions to the b53 driver because this is > shared > code amongst Gigabit capable switch, only 5325 and 5365 are too old to support > that. > > Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot
Re: [PATCH net-next v2 09/12] net: dsa: b53: Wire-up EEE
Florian Fainelli writes: > Add support for enabling and disabling EEE, as well as re-negotiating it in > .adjust_link() and in .port_enable(). > > Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot
Re: [PATCH net-next v2 10/12] net: dsa: b53: Export b53_imp_vlan_setup()
Florian Fainelli writes: > bcm_sf2 and b53 do exactly the same thing, so share that piece. > > Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot
Re: software interrupts close to 100 with 9000 tc filter entries
> Eric Dumazet wrote: > > On Tue, 2017-09-19 at 15:28 +0200, Marco Berizzi wrote: > > > Hi Folks, > > > > I'm running linux 4.12.10 x86_64 on a Slackware 14.2 64bit > > as a simple 4 NIC router. Network throughput processed by > > this machine is less than 200Mbit/s > > The cpu model is Intel(R) Xeon(R) CPU 5160 @ 3.00GHz with > > 2GB ram. > > > > I need to blacklist about 9000 single ip addresses. > > This is the relevant script to blacklist these ip addresses: > > > > tc qdisc add dev eth0 ingress > > tc qdisc add dev eth1 ingress > > > > while read -r line > > do > > tc filter add dev eth0 parent : protocol ip prio 50 u32 match ip src > > $line action drop > > tc filter add dev eth1 parent : protocol ip prio 50 u32 match ip src > > $line action drop > > done < blacklisted_ip_addresses > > > > After loading these ip addresses, the si (software interrupts) > > number shown by top is always close to 100 > > If I delete the ingress qdisc on both the device, the si > > fall down to less than 5 > > > > Running the same script with 'only' 700 ip addresses is > > flawless. > > > > Kindly I would like to ask if am I doing anything in > > a wrong way or if the hardware is too old for this kind > > of setup. > > > > I have selected the tc filter setup instead of netfilter > > one, because I was reading this from iproute2/doc/actions: > > > > A side effect is that we can now get stateless firewalling to work with tc.. > > Essentially this is now an alternative to iptables. > > I wont go into details of my dislike for iptables at times, but. > > scalability is one of the main issues; however, if you need stateful > > classification - use netfilter (for now). > > > > Any response are welcome > > TIA > > Processing a list of 700 rules per incoming packet is not wise. > > Alternatives : > > * netfilter with IPSET : This probably can be done with one lookup in a > table. Probably easiest way to setup. > > * BPF filter (XDP or TC ) Thanks Eric for the quick response. 
For better performance (latency and network throughput), which is the better solution: netfilter with ipset, or BPF?
Re: [PATCH net-next v2 11/12] net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP
Florian Fainelli writes: > The magic number 8 in 3 locations in bcm_sf2_cfp.c actually designates the > number of switch port egress queues, so use that define instead of open-coding > it. > > Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot
[RFC 1/1] net/smc: add SMC rendezvous protocol
The SMC protocol [1] uses a rendezvous protocol to negotiate SMC capability between peers. The current Linux implementation does not use this rendezvous protocol and, thus, is not compliant to RFC7609 and incompatible with other SMC implementations like in zOS. This patch adds support for the SMC rendezvous protocol. Details: The SMC rendezvous protocol relies on the use of a new TCP experimental option. With this option, SMC capabilities are exchanged between the peers during the TCP three way handshake. The goal of this patch is to leave common TCP code unmodified. Thus, it uses netfilter hooks to intercept TCP SYN and SYN/ACK packets. For outgoing packets originating from SMC sockets, the experimental option is added. For inbound packets destined for SMC sockets, the experimental option is checked. Another goal was to minimize the performance impact on non-SMC traffic (when SMC is enabled). The netfilter hooks used for SMC client connections are active only during TCP connection establishment. The netfilter hooks used for SMC servers are active as long as there are listening SMC sockets. 
When the hooks are active, the following additional operations are performed on incoming and outgoing packets: (1) call SMC netfilter hook (all IPv4 packets) (2) check if TCP SYN or SYN/ACK packet (all IPv4 packets) (3) check if packet goes to/comes from SMC socket (SYN & SYN/ACK packets only) (4) check/add SMC experimental option (SMC sockets' SYN & SYN/ACK packets only) References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 Signed-off-by: Hans WippelSigned-off-by: Ursula Braun --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 66 ++- net/smc/smc.h| 10 +- net/smc/smc_rv.c | 542 +++ net/smc/smc_rv.h | 31 5 files changed, 644 insertions(+), 7 deletions(-) create mode 100644 net/smc/smc_rv.c create mode 100644 net/smc/smc_rv.h diff --git a/net/smc/Makefile b/net/smc/Makefile index 188104654b54..2155a7eff41d 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_rv.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..6c280bbcd2fe 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -34,6 +34,7 @@ #include #include "smc.h" +#include "smc_rv.h" #include "smc_clc.h" #include "smc_llc.h" #include "smc_cdc.h" @@ -109,6 +110,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; + int old_state; int rc = 0; if (!sk) @@ -123,6 +125,7 @@ static int smc_release(struct socket *sock) lock_sock_nested(sk, SINGLE_DEPTH_NESTING); else lock_sock(sk); + old_state = sk->sk_state; if (smc->use_fallback) { sk->sk_state = SMC_CLOSED; @@ -132,6 +135,10 @@ static int smc_release(struct socket *sock) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } + if (old_state == SMC_LISTEN) { + smc_rv_nf_unregister_hook(sock_net(sk), 
_nfho_serv); + kfree(smc->listen_pends); + } if (smc->clcsock) { sock_release(smc->clcsock); smc->clcsock = NULL; @@ -178,6 +185,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) sk->sk_destruct = smc_destruct; sk->sk_protocol = SMCPROTO_SMC; smc = smc_sk(sk); + smc->use_fallback = true; /* default: not SMC-capable */ INIT_WORK(>tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(>accept_q); spin_lock_init(>accept_q_lock); @@ -386,6 +394,10 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + if (smc->use_fallback) + /* peer has not signalled SMC-capability */ + goto out_connected; + /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(smc)) { reason_code = SMC_CLC_DECL_IPSEC; @@ -496,7 +508,6 @@ static int smc_connect_rdma(struct smc_sock *smc) smc_tx_init(smc); out_connected: - smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -551,7 +562,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, } smc_copy_sock_settings_to_clc(smc); + smc_rv_nf_register_hook(sock_net(sk), _nfho_clnt); + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc != -EINPROGRESS) +
Re: [PATCH net-next 1/3] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE
On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitovwrote: Thanks for the review! Please correct me if I'm wrong... > On 9/18/17 12:30 PM, Craig Gallek wrote: >> >> From: Craig Gallek >> >> This is a simple non-recursive delete operation. It prunes paths >> of empty nodes in the tree, but it does not try to further compress >> the tree as nodes are removed. >> >> Signed-off-by: Craig Gallek >> --- >> kernel/bpf/lpm_trie.c | 80 >> +-- >> 1 file changed, 77 insertions(+), 3 deletions(-) >> >> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c >> index 1b767844a76f..9d58a576b2ae 100644 >> --- a/kernel/bpf/lpm_trie.c >> +++ b/kernel/bpf/lpm_trie.c >> @@ -389,10 +389,84 @@ static int trie_update_elem(struct bpf_map *map, >> return ret; >> } >> >> -static int trie_delete_elem(struct bpf_map *map, void *key) >> +/* Called from syscall or from eBPF program */ >> +static int trie_delete_elem(struct bpf_map *map, void *_key) >> { >> - /* TODO */ >> - return -ENOSYS; >> + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); >> + struct bpf_lpm_trie_key *key = _key; >> + struct lpm_trie_node __rcu **trim; >> + struct lpm_trie_node *node; >> + unsigned long irq_flags; >> + unsigned int next_bit; >> + size_t matchlen = 0; >> + int ret = 0; >> + >> + if (key->prefixlen > trie->max_prefixlen) >> + return -EINVAL; >> + >> + raw_spin_lock_irqsave(>lock, irq_flags); >> + >> + /* Walk the tree looking for an exact key/length match and keeping >> +* track of where we could begin trimming the tree. The >> trim-point >> +* is the sub-tree along the walk consisting of only single-child >> +* intermediate nodes and ending at a leaf node that we want to >> +* remove. 
>> +*/ >> + trim = >root; >> + node = rcu_dereference_protected( >> + trie->root, lockdep_is_held(>lock)); >> + while (node) { >> + matchlen = longest_prefix_match(trie, node, key); >> + >> + if (node->prefixlen != matchlen || >> + node->prefixlen == key->prefixlen) >> + break; > > > curious why there is no need to do > 'node->prefixlen == trie->max_prefixlen' in the above > like update/lookup do? I don't believe the node->prefixlen == trie->max_prefixlen check in trie_update_elem is necessary. In order to get to this third clause, it implies that the first two clauses evaluated false. Which happens when we find an exact prefix match for the current node, but the to-be-inserted key prefix is different. If the node we are comparing against had a prefix of max_prefixlen, it would not be possible to have both a full prefix match but different prefix lengths. This assumes that there are no nodes in the tree with > max_prefixlen prefixes, but that is handled earlier in the update function. There's a similar (I believe) unnecessary max_prefixlen check in trie_lookup_elem. The function should behave the same way without that check, but at least in this case it's used as an early-out and saves a few lines of execution. > >> + >> + next_bit = extract_bit(key->data, node->prefixlen); >> + /* If we hit a node that has more than one child or is a >> valid >> +* prefix itself, do not remove it. Reset the root of the >> trim >> +* path to its descendant on our path. >> +*/ >> + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || >> + (node->child[0] && node->child[1])) >> + trim = >child[next_bit]; >> + node = rcu_dereference_protected( >> + node->child[next_bit], >> lockdep_is_held(>lock)); >> + } >> + >> + if (!node || node->prefixlen != key->prefixlen || >> + (node->flags & LPM_TREE_NODE_FLAG_IM)) { >> + ret = -ENOENT; >> + goto out; >> + } >> + >> + trie->n_entries--; >> + >> + /* If the node we are removing is not a leaf node, simply mark it >> +* as intermediate and we are done. 
>> +*/ >> + if (rcu_access_pointer(node->child[0]) || >> + rcu_access_pointer(node->child[1])) { >> + node->flags |= LPM_TREE_NODE_FLAG_IM; >> + goto out; >> + } >> + >> + /* trim should now point to the slot holding the start of a path >> from >> +* zero or more intermediate nodes to our leaf node for deletion. >> +*/ >> + while ((node = rcu_dereference_protected( >> + *trim, lockdep_is_held(>lock { >> + RCU_INIT_POINTER(*trim, NULL); >> + trim = rcu_access_pointer(node->child[0]) ? >> + >child[0] : >> + >child[1]; >> +
Re: [PATCH RFC V1 net-next 0/6] Time based packet transmission
On Mon, Sep 18, 2017 at 09:41:15AM +0200, Richard Cochran wrote: > This series is an early RFC that introduces a new socket option > allowing time based transmission of packets. This option will be > useful in implementing various real time protocols over Ethernet, > including but not limited to P802.1Qbv, which is currently finding > its way into 802.1Q. If I understand it correctly, this also allows us to make a PTP/NTP "one-step" clock with HW that doesn't support it directly. > * Open questions about SO_TXTIME semantics > > - What should the kernel do if the dialed Tx time is in the past? > Should the packet be sent ASAP, or should we throw an error? Dropping the packet with an error would make more sense to me. > - What should the timescale be for the dialed Tx time? Should the > kernel select UTC when using the SW Qdisc and the HW time > otherwise? Or should the socket option include a clockid_t? I think for applications that don't (want to) bind their socket to a specific interface it would be useful if the cmsg specified clockid_t or maybe if_index. If the packet would be sent using a different PHC/interface, it should be dropped. > | | plain preempt_rt | so_txtime | txtime @ 250 us | > |-+--+---+-| > | min:|+1.940800e+04 | +4.72e+02 | +4.72e+02 | > | max:|+7.556000e+04 | +5.68e+02 | +5.76e+02 | > | pk-pk: |+5.615200e+04 | +9.60e+01 | +1.04e+02 | > | mean: |+3.292776e+04 | +5.072274e+02 | +5.073602e+02 | > | stddev: |+6.514709e+03 | +1.310849e+01 | +1.507144e+01 | > | count: | 60 |60 | 240 | > > Using so_txtime, the peak to peak jitter is about 100 nanoseconds, Nice! -- Miroslav Lichvar
Re: [PATCH net-next 07/14] gtp: Support encapsulation of IPv6 packets
From: Harald WelteDate: Tue, 19 Sep 2017 20:12:45 +0800 > Hi Dave, > > On Mon, Sep 18, 2017 at 09:19:08PM -0700, David Miller wrote: > >> > +static inline u32 ipv6_hashfn(const struct in6_addr *a) >> > +{ >> > + return __ipv6_addr_jhash(a, gtp_h_initval); >> > +} >> >> I know you are just following the pattern of the existing "ipv4_hashfn()" >> here >> but this kind of stuff is not very global namespace friendly. Even simply >> adding a "gtp_" prefix to these hash functions would be a lot better. > > I would agree if this was an inline function defined in a header file or > a non-static function. But where is the global namespace concern in > case of static inline functions defined and used in the same .c file? The problem is if we create a generic ipv6_hashfn() in linux/ipv6.h or something like that, then this driver stops building.
Re: [PATCH net-next 2/4] qed: Add iWARP out of order support
On Tue, Sep 19, 2017 at 08:26:17PM +0300, Michal Kalderon wrote: > iWARP requires OOO support which is already provided by the ll2 > interface (until now was used only for iSCSI offload). > The changes mostly include opening a ll2 dedicated connection for > OOO and notifiying the FW about the handle id. > > Signed-off-by: Michal Kalderon> Signed-off-by: Ariel Elior > --- > drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 44 > + > drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 +++- > drivers/net/ethernet/qlogic/qed/qed_rdma.c | 7 +++-- > 3 files changed, 59 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c > b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c > index 9d989c9..568e985 100644 > --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c > +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c > @@ -41,6 +41,7 @@ > #include "qed_rdma.h" > #include "qed_reg_addr.h" > #include "qed_sp.h" > +#include "qed_ooo.h" > > #define QED_IWARP_ORD_DEFAULT32 > #define QED_IWARP_IRD_DEFAULT32 > @@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn > *p_hwfn, u32 cid) > spin_unlock_bh(_hwfn->p_rdma_info->lock); > } > > +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, > + struct iwarp_init_func_params *p_ramrod) > +{ > + p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) + > + p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle; > +} > + > static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid) > { > int rc; > @@ -1876,6 +1884,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, > struct qed_ptt *p_ptt) > iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; > } > > + if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) { > + rc = qed_ll2_terminate_connection(p_hwfn, > + iwarp_info->ll2_ooo_handle); > + if (rc) > + DP_INFO(p_hwfn, "Failed to terminate ooo connection\n"); What exactly will you do with this knowledge? 
Anyway you are not interested in return values of qed_ll2_terminate_connection function in this place and other places too. Why don't you handle EAGAIN returned from the qed_ll2_terminate_connection()? Thanks > + > + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle); > + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; > + } > + > qed_llh_remove_mac_filter(p_hwfn, > p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr); > return rc; > @@ -1927,10 +1945,12 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn > *p_hwfn, struct qed_ptt *p_ptt) > struct qed_iwarp_info *iwarp_info; > struct qed_ll2_acquire_data data; > struct qed_ll2_cbs cbs; > + u16 n_ooo_bufs; > int rc = 0; > > iwarp_info = _hwfn->p_rdma_info->iwarp; > iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; > + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; > > iwarp_info->max_mtu = params->max_mtu; > > @@ -1978,6 +1998,29 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, > struct qed_ptt *p_ptt) > if (rc) > goto err; > > + /* Start OOO connection */ > + data.input.conn_type = QED_LL2_TYPE_OOO; > + data.input.mtu = params->max_mtu; > + > + n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) / > + iwarp_info->max_mtu; > + n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE); > + > + data.input.rx_num_desc = n_ooo_bufs; > + data.input.rx_num_ooo_buffers = n_ooo_bufs; > + > + data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ > + data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE; > + data.p_connection_handle = _info->ll2_ooo_handle; > + > + rc = qed_ll2_acquire_connection(p_hwfn, ); > + if (rc) > + goto err; > + > + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle); > + if (rc) > + goto err; > + > return rc; > err: > qed_iwarp_ll2_stop(p_hwfn, p_ptt); > @@ -2014,6 +2057,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct > qed_ptt *p_ptt, > > qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP, > qed_iwarp_async_event); 
> + qed_ooo_setup(p_hwfn); > > return qed_iwarp_ll2_start(p_hwfn, params, p_ptt); > } > diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h > b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h > index 148ef3c..9e2bfde 100644 > --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h > +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h > @@ -47,7 +47,12 @@ enum qed_iwarp_qp_state { > #define QED_IWARP_LL2_SYN_TX_SIZE (128) > #define
Re: [PATCH net-next 03/14] gtp: Call common functions to get tunnel routes and add dst_cache
From: Harald Welte Date: Tue, 19 Sep 2017 20:09:42 +0800 > So I guess you're asking us to document that rationale as form of a > source code comment ? Yes that would make ignoring the potential changing of the non-const 'saddr' argument at least be documented.
Re: [PATCH net-next 3/4] qed: Fix maximum number of CQs for iWARP
On Tue, Sep 19, 2017 at 08:26:18PM +0300, Michal Kalderon wrote: > The maximum number of CQs supported is bound to the number > of connections supported, which differs between RoCE and iWARP. > > This fixes a crash that occurred in iWARP when running 1000 sessions > using perftest. > > Signed-off-by: Michal Kalderon> Signed-off-by: Ariel Elior > --- It is worth to add Fixes line. Thanks signature.asc Description: PGP signature
[PATCH net] net: change skb->mac_header when Generic XDP calls adjust_head
Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree--- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index fb766d9..9a2254f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT:
[PATCH net-next v3 02/12] net: dsa: b53: Make b53_enable_cpu_port() take a port argument
In preparation for future changes allowing the configuring of multiple CPU ports, make b53_enable_cpu_port() take a port argument. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 274f3679f33d..d8bc54cfcfbe 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -538,19 +538,18 @@ static void b53_disable_port(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } -static void b53_enable_cpu_port(struct b53_device *dev) +static void b53_enable_cpu_port(struct b53_device *dev, int port) { - unsigned int cpu_port = dev->cpu_port; u8 port_ctrl; /* BCM5325 CPU port is at 8 */ - if ((is5325(dev) || is5365(dev)) && cpu_port == B53_CPU_PORT_25) - cpu_port = B53_CPU_PORT; + if ((is5325(dev) || is5365(dev)) && port == B53_CPU_PORT_25) + port = B53_CPU_PORT; port_ctrl = PORT_CTRL_RX_BCST_EN | PORT_CTRL_RX_MCST_EN | PORT_CTRL_RX_UCST_EN; - b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(cpu_port), port_ctrl); + b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl); } static void b53_enable_mib(struct b53_device *dev) @@ -820,7 +819,7 @@ static int b53_setup(struct dsa_switch *ds) if (BIT(port) & ds->enabled_port_mask) b53_enable_port(ds, port, NULL); else if (dsa_is_cpu_port(ds, port)) - b53_enable_cpu_port(dev); + b53_enable_cpu_port(dev, port); else b53_disable_port(ds, port, NULL); } -- 2.9.3
Re: [PATCH net-next 2/4] net: dsa: setup master ethtool unconditionally
On 09/19/2017 08:56 AM, Vivien Didelot wrote: > When a DSA switch tree is meant to be applied, it already has a CPU > port. Thus remove the condition of dst->cpu_dp. > > Moreover, the next lines access dst->cpu_dp unconditionally. > > Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli -- Florian
[PATCH net-next v3 05/12] net: dsa: b53: Use a macro to define I/O operations
Instead of repeating the same pattern: acquire mutex, read/write, release mutex, define a macro: b53_build_op() which takes the type (read|write), I/O size, and value (scalar or pointer). This helps with fixing bugs that could exist (e.g: missing barrier, lock etc.). Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_priv.h | 133 +++-- 1 file changed, 22 insertions(+), 111 deletions(-) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 7528b22aeb03..5bebe97900e8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -199,119 +199,30 @@ static inline void b53_switch_remove(struct b53_device *dev) dsa_unregister_switch(dev->ds); } -static inline int b53_read8(struct b53_device *dev, u8 page, u8 reg, u8 *val) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->read8(dev, page, reg, val); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_read16(struct b53_device *dev, u8 page, u8 reg, u16 *val) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->read16(dev, page, reg, val); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_read32(struct b53_device *dev, u8 page, u8 reg, u32 *val) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->read32(dev, page, reg, val); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_read48(struct b53_device *dev, u8 page, u8 reg, u64 *val) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->read48(dev, page, reg, val); - mutex_unlock(>reg_mutex); - - return ret; +#define b53_build_op(type_op_size, val_type) \ +static inline int b53_##type_op_size(struct b53_device *dev, u8 page, \ +u8 reg, val_type val) \ +{ \ + int ret;\ + \ + mutex_lock(>reg_mutex);\ + ret = dev->ops->type_op_size(dev, page, reg, val); \ + mutex_unlock(>reg_mutex); \ + \ + return ret; \ } -static inline int b53_read64(struct b53_device *dev, u8 page, u8 reg, u64 *val) -{ - int ret; - 
- mutex_lock(>reg_mutex); - ret = dev->ops->read64(dev, page, reg, val); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_write8(struct b53_device *dev, u8 page, u8 reg, u8 value) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->write8(dev, page, reg, value); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_write16(struct b53_device *dev, u8 page, u8 reg, - u16 value) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->write16(dev, page, reg, value); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_write32(struct b53_device *dev, u8 page, u8 reg, - u32 value) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->write32(dev, page, reg, value); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_write48(struct b53_device *dev, u8 page, u8 reg, - u64 value) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->write48(dev, page, reg, value); - mutex_unlock(>reg_mutex); - - return ret; -} - -static inline int b53_write64(struct b53_device *dev, u8 page, u8 reg, - u64 value) -{ - int ret; - - mutex_lock(>reg_mutex); - ret = dev->ops->write64(dev, page, reg, value); - mutex_unlock(>reg_mutex); - - return ret; -} +b53_build_op(read8, u8 *); +b53_build_op(read16, u16 *); +b53_build_op(read32, u32 *); +b53_build_op(read48, u64 *); +b53_build_op(read64, u64 *); + +b53_build_op(write8, u8); +b53_build_op(write16, u16); +b53_build_op(write32, u32); +b53_build_op(write48, u64); +b53_build_op(write64, u64); struct b53_arl_entry { u8 port; -- 2.9.3
[PATCH net-next v3 01/12] net: dsa: b53: Remove is_cpu_port()
This is not used anywhere, so remove it. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_priv.h | 5 - 1 file changed, 5 deletions(-) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 01bd8cbe9a3f..7528b22aeb03 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -186,11 +186,6 @@ static inline int is58xx(struct b53_device *dev) #define B53_CPU_PORT_255 #define B53_CPU_PORT 8 -static inline int is_cpu_port(struct b53_device *dev, int port) -{ - return dev->cpu_port; -} - struct b53_device *b53_switch_alloc(struct device *base, const struct b53_io_ops *ops, void *priv); -- 2.9.3
Re: pull-request: mac80211 2017-09-19
From: Johannes Berg Date: Tue, 19 Sep 2017 09:20:47 +0200 > Here's a new set of two small changes to prevent null pointer > dereferences on malformed netlink messages. > > Please pull and let me know if there's any problem. Pulled, thank you.
Re: [PATCH net-next 05/14] gtp: Remove special mtu handling
On Tue, Sep 19, 2017 at 4:42 AM, Harald Weltewrote: > Hi Tom, > > On Mon, Sep 18, 2017 at 05:38:55PM -0700, Tom Herbert wrote: >> Removes MTU handling in gtp_build_skb_ip4. This is non standard relative >> to how other tunneling protocols handle MTU. The model espoused is that >> the inner interface should set it's MTU to be less than the expected >> path MTU on the overlay network. Path MTU discovery is not typically >> used for modifying tunnel MTUs. > > The point of the kernel GTP module is to interoperate with existing > other GTP implementations and the practises established by cellular > operators when operating GTP in their networks. > > While what you describe (chose interface MTU to be less than the > expected path MTU) is generally best practise in the Linux IP/networking > world, this is not generally reflected in the cellular > universe. You see quite a bit of GTP fragmentation due to the fact > that the transport network simply has to deal with the MTU that has > been established via the control plane between SGSN and MS/UE, without > the GGSN even being part of that negotiation. > > Also, you may very well have one "gtp0" tunnel device at the GGSN, > but you are establishing individual GTP tunnels to dozesn to hundreds of > different SGSNs at operators all over the world. You cannot reliably > set the "gtp0" interface MTU to "the path MTU of the overlay network", > as the overlay network is in fact different for each of the SGSNs you're > talking to - and each may have a different path MTU. > > So unless I'm missing something, I would currently vote for staying with > the current code, which uses the path MTU to the specific destination IP > address (the SGSN). > Okay, I'll modify tnl_update_pmtu so we can call it from GTP and not have to replicate that function. I suspect VXLAN might also what this at some point. 
Tom > Regards, > Harald > > -- > - Harald Welte http://laforge.gnumonks.org/ > > "Privacy in residential applications is a desirable marketing option." > (ETSI EN 300 175-7 Ch. A6)
Re: [RFC PATCH 3/3] usbnet: Fix memory leak when rx_submit() fails
Douglas Anderson writes: > If rx_submit() returns an error code then nobody calls usb_free_urb(). > That means it's leaked. Nope. rx_submit() will call usb_free_urb() before returning an error: static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) .. if (!skb) { netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); usbnet_defer_kevent (dev, EVENT_RX_MEMORY); usb_free_urb (urb); return -ENOMEM; } .. if (retval) { dev_kfree_skb_any (skb); usb_free_urb (urb); } Bjørn
[PATCH net-next v3 07/12] net: dsa: b53: Define EEE register page
In preparation for migrating the EEE code from bcm_sf2 to b53, define the full EEE register page and offsets within that page. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_regs.h | 41 + 1 file changed, 41 insertions(+) diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index 5e8b8e31fee8..2a9f421680aa 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -50,6 +50,9 @@ /* Jumbo Frame Registers */ #define B53_JUMBO_PAGE 0x40 +/* EEE Control Registers Page */ +#define B53_EEE_PAGE 0x92 + /* CFP Configuration Registers Page */ #define B53_CFP_PAGE 0xa1 @@ -472,6 +475,44 @@ #define JMS_MAX_SIZE 9724 /* + * EEE Configuration Page Registers + */ + +/* EEE Enable control register (16 bit) */ +#define B53_EEE_EN_CTRL0x00 + +/* EEE LPI assert status register (16 bit) */ +#define B53_EEE_LPI_ASSERT_STS 0x02 + +/* EEE LPI indicate status register (16 bit) */ +#define B53_EEE_LPI_INDICATE 0x4 + +/* EEE Receiving idle symbols status register (16 bit) */ +#define B53_EEE_RX_IDLE_SYM_STS0x6 + +/* EEE Pipeline timer register (32 bit) */ +#define B53_EEE_PIP_TIMER 0xC + +/* EEE Sleep timer Gig register (32 bit) */ +#define B53_EEE_SLEEP_TIMER_GIG(i) (0x10 + 4 * (i)) + +/* EEE Sleep timer FE register (32 bit) */ +#define B53_EEE_SLEEP_TIMER_FE(i) (0x34 + 4 * (i)) + +/* EEE Minimum LP timer Gig register (32 bit) */ +#define B53_EEE_MIN_LP_TIMER_GIG(i)(0x58 + 4 * (i)) + +/* EEE Minimum LP timer FE register (32 bit) */ +#define B53_EEE_MIN_LP_TIMER_FE(i) (0x7c + 4 * (i)) + +/* EEE Wake timer Gig register (16 bit) */ +#define B53_EEE_WAKE_TIMER_GIG(i) (0xa0 + 2 * (i)) + +/* EEE Wake timer FE register (16 bit) */ +#define B53_EEE_WAKE_TIMER_FE(i) (0xb2 + 2 * (i)) + + +/* * CFP Configuration Page Registers */ -- 2.9.3
[PATCH net-next v3 08/12] net: dsa: b53: Move EEE functions to b53
Move the bcm_sf2 EEE-related functions to the b53 driver because this is shared code amongst Gigabit capable switch, only 5325 and 5365 are too old to support that. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 63 ++ drivers/net/dsa/b53/b53_priv.h | 5 +++ drivers/net/dsa/bcm_sf2.c| 66 drivers/net/dsa/bcm_sf2.h| 2 -- drivers/net/dsa/bcm_sf2_regs.h | 3 -- 5 files changed, 74 insertions(+), 65 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index aa2187c71ea5..491e4ffa8a0e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1531,6 +1531,69 @@ void b53_mirror_del(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_mirror_del); +void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable) +{ + struct b53_device *dev = ds->priv; + u16 reg; + + b53_read16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, ); + if (enable) + reg |= BIT(port); + else + reg &= ~BIT(port); + b53_write16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, reg); +} +EXPORT_SYMBOL(b53_eee_enable_set); + + +/* Returns 0 if EEE was not enabled, or 1 otherwise + */ +int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy) +{ + int ret; + + ret = phy_init_eee(phy, 0); + if (ret) + return 0; + + b53_eee_enable_set(ds, port, true); + + return 1; +} +EXPORT_SYMBOL(b53_eee_init); + +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) +{ + struct b53_device *dev = ds->priv; + struct ethtool_eee *p = >ports[port].eee; + u16 reg; + + if (is5325(dev) || is5365(dev)) + return -EOPNOTSUPP; + + b53_read16(dev, B53_EEE_PAGE, B53_EEE_LPI_INDICATE, ); + e->eee_enabled = p->eee_enabled; + e->eee_active = !!(reg & BIT(port)); + + return 0; +} +EXPORT_SYMBOL(b53_get_mac_eee); + +int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) +{ + struct b53_device *dev = ds->priv; + struct ethtool_eee *p = >ports[port].eee; + + if (is5325(dev) || 
is5365(dev)) + return -EOPNOTSUPP; + + p->eee_enabled = e->eee_enabled; + b53_eee_enable_set(ds, port, e->eee_enabled); + + return 0; +} +EXPORT_SYMBOL(b53_set_mac_eee); + static const struct dsa_switch_ops b53_switch_ops = { .get_tag_protocol = b53_get_tag_protocol, .setup = b53_setup, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 77102f685da0..aabe80eab25d 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -70,6 +70,7 @@ enum { struct b53_port { u16 vlan_ctl_mask; + struct ethtool_eee eee; }; struct b53_vlan { @@ -310,5 +311,9 @@ int b53_mirror_add(struct dsa_switch *ds, int port, void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); +void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable); +int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e); +int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e); #endif diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 49cb51223f70..4e8ef4c07eab 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -107,19 +107,6 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) core_writel(priv, reg, offset); } -static void bcm_sf2_eee_enable_set(struct dsa_switch *ds, int port, bool enable) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - u32 reg; - - reg = core_readl(priv, CORE_EEE_EN_CTRL); - if (enable) - reg |= 1 << port; - else - reg &= ~(1 << port); - core_writel(priv, reg, CORE_EEE_EN_CTRL); -} - static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); @@ -256,8 +243,8 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, bcm_sf2_imp_vlan_setup(ds, cpu_port); /* If EEE was enabled, restore it */ 
- if (priv->port_sts[port].eee.eee_enabled) - bcm_sf2_eee_enable_set(ds, port, true); + if (priv->dev->ports[port].eee.eee_enabled) + b53_eee_enable_set(ds, port, true); return 0; } @@ -292,47 +279,6 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); } -/* Returns 0 if EEE was
[PATCH net-next v3 12/12] net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port
Export b53_{enable,disable}_port and use these two functions in bcm_sf2_port_setup and bcm_sf2_port_disable. The generic functions cannot be used without wrapping because we need to manage additional switch integration details (PHY, Broadcom tag etc.). Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 8 drivers/net/dsa/b53/b53_priv.h | 2 ++ drivers/net/dsa/bcm_sf2.c| 26 ++ 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index c3f1cd2c33ea..a9f2a5b55a5e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -502,8 +502,7 @@ void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } EXPORT_SYMBOL(b53_imp_vlan_setup); -static int b53_enable_port(struct dsa_switch *ds, int port, - struct phy_device *phy) +int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; unsigned int cpu_port = dev->cpu_port; @@ -530,9 +529,9 @@ static int b53_enable_port(struct dsa_switch *ds, int port, return 0; } +EXPORT_SYMBOL(b53_enable_port); -static void b53_disable_port(struct dsa_switch *ds, int port, -struct phy_device *phy) +void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; u8 reg; @@ -542,6 +541,7 @@ static void b53_disable_port(struct dsa_switch *ds, int port, reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE; b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } +EXPORT_SYMBOL(b53_disable_port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) { diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 8f4f83e2e4bd..603c66d240d8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -311,6 +311,8 @@ int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void 
b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); +int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); +void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable); int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 08639674947a..0072a959db5b 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -163,7 +163,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, struct phy_device *phy) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - s8 cpu_port = ds->dst->cpu_dp->index; unsigned int i; u32 reg; @@ -184,9 +183,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg |= i << (PRT_TO_QID_SHIFT * i); core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port)); - /* Clear the Rx and Tx disable bits and set to no spanning tree */ - core_writel(priv, 0, CORE_G_PCTL_PORT(port)); - /* Re-enable the GPHY and re-apply workarounds */ if (priv->int_phy_mask & 1 << port && priv->hw_params.num_gphy == 1) { bcm_sf2_gphy_enable_set(ds, true); @@ -209,23 +205,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, if (port == priv->moca_port) bcm_sf2_port_intr_enable(priv, port); - /* Set this port, and only this one to be in the default VLAN, -* if member of a bridge, restore its membership prior to -* bringing down this port. 
-*/ - reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port)); - reg &= ~PORT_VLAN_CTRL_MASK; - reg |= (1 << port); - reg |= priv->dev->ports[port].vlan_ctl_mask; - core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port)); - - b53_imp_vlan_setup(ds, cpu_port); - - /* If EEE was enabled, restore it */ - if (priv->dev->ports[port].eee.eee_enabled) - b53_eee_enable_set(ds, port, true); - - return 0; + return b53_enable_port(ds, port, phy); } static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, @@ -248,9 +228,7 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, else off = CORE_G_PCTL_PORT(port); - reg = core_readl(priv, off); - reg |= RX_DIS | TX_DIS; - core_writel(priv, reg, off); + b53_disable_port(ds, port, phy); /* Power down the port memory */ reg =
[PATCH net-next v3 03/12] net: dsa: b53: Defer port enabling to calling port_enable
There is no need to configure the enabled ports once in b53_setup() and then a second time around when dsa_switch_ops::port_enable is called, just do it when port_enable is called which is better in terms of power consumption and correctness. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d8bc54cfcfbe..3297af6aab8a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -815,12 +815,13 @@ static int b53_setup(struct dsa_switch *ds) if (ret) dev_err(ds->dev, "failed to apply configuration\n"); + /* Configure IMP/CPU port, disable unused ports. Enabled +* ports will be configured with .port_enable +*/ for (port = 0; port < dev->num_ports; port++) { - if (BIT(port) & ds->enabled_port_mask) - b53_enable_port(ds, port, NULL); - else if (dsa_is_cpu_port(ds, port)) + if (dsa_is_cpu_port(ds, port)) b53_enable_cpu_port(dev, port); - else + else if (!(BIT(port) & ds->enabled_port_mask)) b53_disable_port(ds, port, NULL); } -- 2.9.3
[PATCH net-next v3 09/12] net: dsa: b53: Wire-up EEE
Add support for enabling and disabling EEE, as well as re-negotiating it in .adjust_link() and in .port_enable(). Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 491e4ffa8a0e..4e37ec27e496 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -523,6 +523,10 @@ static int b53_enable_port(struct dsa_switch *ds, int port, b53_imp_vlan_setup(ds, cpu_port); + /* If EEE was enabled, restore it */ + if (dev->ports[port].eee.eee_enabled) + b53_eee_enable_set(ds, port, true); + return 0; } @@ -879,6 +883,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phydev) { struct b53_device *dev = ds->priv; + struct ethtool_eee *p = >ports[port].eee; u8 rgmii_ctrl = 0, reg = 0, off; if (!phy_is_pseudo_fixed_link(phydev)) @@ -1000,6 +1005,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po); } } + + /* Re-negotiate EEE if it was enabled already */ + p->eee_enabled = b53_eee_init(ds, port, phydev); } int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) @@ -1605,6 +1613,8 @@ static const struct dsa_switch_ops b53_switch_ops = { .adjust_link= b53_adjust_link, .port_enable= b53_enable_port, .port_disable = b53_disable_port, + .get_mac_eee= b53_get_mac_eee, + .set_mac_eee= b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, .port_stp_state_set = b53_br_set_stp_state, -- 2.9.3
[PATCH net-next v3 11/12] net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP
The magic number 8 in 3 locations in bcm_sf2_cfp.c actually designates the number of switch port egress queues, so use that define instead of open-coding it. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/bcm_sf2_cfp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 8a1da7e67707..94649e1481ec 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -144,7 +144,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, * destination port is enabled and that we are within the * number of ports supported by the switch */ - port_num = fs->ring_cookie / 8; + port_num = fs->ring_cookie / SF2_NUM_EGRESS_QUEUES; if (fs->ring_cookie == RX_CLS_FLOW_DISC || !(BIT(port_num) & ds->enabled_port_mask) || @@ -280,7 +280,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, * We have a small oddity where Port 6 just does not have a * valid bit here (so we subtract by one). */ - queue_num = fs->ring_cookie % 8; + queue_num = fs->ring_cookie % SF2_NUM_EGRESS_QUEUES; if (port_num >= 7) port_num -= 1; @@ -401,7 +401,7 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port, /* There is no Port 6, so we compensate for that here */ if (nfc->fs.ring_cookie >= 6) nfc->fs.ring_cookie++; - nfc->fs.ring_cookie *= 8; + nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES; /* Extract the destination queue */ queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK; -- 2.9.3
[PATCH net-next v3 10/12] net: dsa: b53: Export b53_imp_vlan_setup()
bcm_sf2 and b53 do exactly the same thing, so share that piece. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 3 ++- drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/bcm_sf2.c| 23 +-- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 4e37ec27e496..c3f1cd2c33ea 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -484,7 +484,7 @@ static int b53_fast_age_vlan(struct b53_device *dev, u16 vid) return b53_flush_arl(dev, FAST_AGE_VLAN); } -static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) +void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) { struct b53_device *dev = ds->priv; unsigned int i; @@ -500,6 +500,7 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(i), pvlan); } } +EXPORT_SYMBOL(b53_imp_vlan_setup); static int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index aabe80eab25d..8f4f83e2e4bd 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -284,6 +284,7 @@ static inline int b53_switch_get_reset_gpio(struct b53_device *dev) #endif /* Exported functions towards other drivers */ +void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port); void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data); void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data); int b53_get_sset_count(struct dsa_switch *ds); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 4e8ef4c07eab..08639674947a 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -40,27 +40,6 @@ static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds) return DSA_TAG_PROTO_BRCM; } -static void 
bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - unsigned int i; - u32 reg; - - /* Enable the IMP Port to be in the same VLAN as the other ports -* on a per-port basis such that we only have Port i and IMP in -* the same VLAN. -*/ - for (i = 0; i < priv->hw_params.num_ports; i++) { - if (!((1 << i) & ds->enabled_port_mask)) - continue; - - reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i)); - reg |= (1 << cpu_port); - core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i)); - } -} - - static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); @@ -240,7 +219,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg |= priv->dev->ports[port].vlan_ctl_mask; core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port)); - bcm_sf2_imp_vlan_setup(ds, cpu_port); + b53_imp_vlan_setup(ds, cpu_port); /* If EEE was enabled, restore it */ if (priv->dev->ports[port].eee.eee_enabled) -- 2.9.3
[PATCH net-next v3 04/12] net: dsa: bcm_sf2: Defer port enabling to calling port_enable
There is no need to configure the enabled ports once in bcm_sf2_sw_setup() and then a second time around when dsa_switch_ops::port_enable is called, just do it when port_enable is called which is better in terms of power consumption and correctness. Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/bcm_sf2.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index d7b53d53c116..8acbd17bc1fd 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -890,14 +890,11 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); unsigned int port; - /* Enable all valid ports and disable those unused */ + /* Disable unused ports and configure IMP port */ for (port = 0; port < priv->hw_params.num_ports; port++) { - /* IMP port receives special treatment */ - if ((1 << port) & ds->enabled_port_mask) - bcm_sf2_port_setup(ds, port, NULL); - else if (dsa_is_cpu_port(ds, port)) + if (dsa_is_cpu_port(ds, port)) bcm_sf2_imp_setup(ds, port); - else + else if (!((1 << port) & ds->enabled_port_mask)) bcm_sf2_port_disable(ds, port, NULL); } -- 2.9.3
[PATCH net-next v3 06/12] net: dsa: b53: Move Broadcom header setup to b53
The code to enable Broadcom tags/headers is largely switch independent, and in preparation for enabling it for multiple devices with b53, move the code we have in bcm_sf2.c to b53_common.c Reviewed-by: Vivien DidelotSigned-off-by: Florian Fainelli --- drivers/net/dsa/b53/b53_common.c | 47 drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/b53/b53_regs.h | 7 ++ drivers/net/dsa/bcm_sf2.c| 43 ++-- drivers/net/dsa/bcm_sf2_regs.h | 8 --- 5 files changed, 57 insertions(+), 49 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 3297af6aab8a..aa2187c71ea5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -538,6 +538,53 @@ static void b53_disable_port(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } +void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) +{ + struct b53_device *dev = ds->priv; + u8 hdr_ctl, val; + u16 reg; + + /* Resolve which bit controls the Broadcom tag */ + switch (port) { + case 8: + val = BRCM_HDR_P8_EN; + break; + case 7: + val = BRCM_HDR_P7_EN; + break; + case 5: + val = BRCM_HDR_P5_EN; + break; + default: + val = 0; + break; + } + + /* Enable Broadcom tags for IMP port */ + b53_read8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, _ctl); + hdr_ctl |= val; + b53_write8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, hdr_ctl); + + /* Registers below are only accessible on newer devices */ + if (!is58xx(dev)) + return; + + /* Enable reception Broadcom tag for CPU TX (switch RX) to +* allow us to tag outgoing frames +*/ + b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, ); + reg &= ~BIT(port); + b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, reg); + + /* Enable transmission of Broadcom tags from the switch (CPU RX) to +* allow delivering frames to the per-port net_devices +*/ + b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, ); + reg &= ~BIT(port); + b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, reg); +} 
+EXPORT_SYMBOL(b53_brcm_hdr_setup); + static void b53_enable_cpu_port(struct b53_device *dev, int port) { u8 port_ctrl; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 5bebe97900e8..77102f685da0 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -309,5 +309,6 @@ int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); +void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); #endif diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index e5c86d44667a..5e8b8e31fee8 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -210,6 +210,7 @@ #define B53_BRCM_HDR 0x03 #define BRCM_HDR_P8_EN BIT(0) /* Enable tagging on port 8 */ #define BRCM_HDR_P5_EN BIT(1) /* Enable tagging on port 5 */ +#define BRCM_HDR_P7_EN BIT(2) /* Enable tagging on port 7 */ /* Mirror capture control register (16 bit) */ #define B53_MIR_CAP_CTL0x10 @@ -249,6 +250,12 @@ /* Revision ID register (8 bit) */ #define B53_REV_ID 0x40 +/* Broadcom header RX control (16 bit) */ +#define B53_BRCM_HDR_RX_DIS0x60 + +/* Broadcom header TX control (16 bit) */ +#define B53_BRCM_HDR_TX_DIS0x62 + /* * ARL Access Page Registers */ diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 8acbd17bc1fd..49cb51223f70 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -60,45 +60,6 @@ static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } } -static void bcm_sf2_brcm_hdr_setup(struct bcm_sf2_priv *priv, int port) -{ - u32 reg, val; - - /* Resolve which bit controls the Broadcom tag */ - switch (port) { - case 8: - val = BRCM_HDR_EN_P8; - break; - case 7: - val = BRCM_HDR_EN_P7; - break; - case 5: - val = BRCM_HDR_EN_P5; - break; - default: - val = 0; - break; - } - - /* Enable Broadcom tags for IMP 
port */ - reg
[PATCH net-next v3 00/12] net: dsa: b53/bcm_sf2 cleanups
Hi all, This patch series is a first pass set of clean-ups to reduce the number of LOCs between b53 and bcm_sf2 and sharing as many functions as possible. There is a number of additional cleanups queued up locally that require more thorough testing. Thanks! Changes in v3: - remove one extra argument for the b53_build_io_op macro (David Laight) - added additional Reviewed-by tags from Vivien Changes in v2: - added Reviewed-by tags from Vivien - added a missing EXPORT_SYMBOL() in patch 8 - fixed a typo in patch 5 Florian Fainelli (12): net: dsa: b53: Remove is_cpu_port() net: dsa: b53: Make b53_enable_cpu_port() take a port argument net: dsa: b53: Defer port enabling to calling port_enable net: dsa: bcm_sf2: Defer port enabling to calling port_enable net: dsa: b53: Use a macro to define I/O operations net: dsa: b53: Move Broadcom header setup to b53 net: dsa: b53: Define EEE register page net: dsa: b53: Move EEE functions to b53 net: dsa: b53: Wire-up EEE net: dsa: b53: Export b53_imp_vlan_setup() net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP net: dsa: bcm_sf2: Utilize b53_{enable,disable}_port drivers/net/dsa/b53/b53_common.c | 151 drivers/net/dsa/b53/b53_priv.h | 145 --- drivers/net/dsa/b53/b53_regs.h | 48 drivers/net/dsa/bcm_sf2.c| 161 +++ drivers/net/dsa/bcm_sf2.h| 2 - drivers/net/dsa/bcm_sf2_cfp.c| 6 +- drivers/net/dsa/bcm_sf2_regs.h | 11 --- 7 files changed, 228 insertions(+), 296 deletions(-) -- 2.9.3
Re: [RFC PATCH 1/3] usbnet: Get rid of spammy usbnet "kevent X may have been dropped"
Douglas Anderson writes: > Every once in a while when my system is under a bit of stress I see > some spammy messages show up in my logs that say: > > kevent X may have been dropped > > As far as I can tell these messages aren't terribly useful. I agree, FWIW. These messages just confuse users for no purpose at all. > + /* If work is already started this will mark it to run again when it > + * finishes; if we already had work pending and it hadn't started > + * yet then that's fine too. > + */ > + schedule_work (&dev->kevent); > + netdev_dbg(dev->net, "kevent %d scheduled\n", work); Or maybe if (schedule_work (&dev->kevent)) netdev_dbg(dev->net, "kevent %d scheduled\n", work); ? Not that I think it matters much. Bjørn
Re: [PATCH net-next 3/4] net: dsa: setup master ethtool after dsa_ptr
On 09/19/2017 08:56 AM, Vivien Didelot wrote: > DSA overrides the master's ethtool ops so that we can inject its CPU > port's statistics. Because of that, we need to setup the ethtool ops > after the master's dsa_ptr pointer has been assigned, not before. Yes, good point, technically this is a bugfix, but since we have changed this quite often and the race is tiny, I am not positive we could a) trigger this in real life, and b) provide a proper Fixes tag. > > This patch setups the ethtool ops after dsa_ptr is assigned, and > restores them before it gets cleared. > > Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli -- Florian
Re: [PATCH 2/3] selftests: actually run the various net selftests
On Tue, Sep 19, 2017 at 9:34 AM, Josef Bacikwrote: > On Mon, Sep 18, 2017 at 04:14:41PM -0600, Shuah Khan wrote: >> On 09/18/2017 11:32 AM, jo...@toxicpanda.com wrote: >> > From: Josef Bacik >> > >> > These self tests are just self contained binaries, they are not run by >> > any of the scripts in the directory. This means they need to be marked >> > with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES. >> > >> > Signed-off-by: Josef Bacik >> > --- >> > tools/testing/selftests/net/Makefile | 4 ++-- >> > 1 file changed, 2 insertions(+), 2 deletions(-) >> > >> > diff --git a/tools/testing/selftests/net/Makefile >> > b/tools/testing/selftests/net/Makefile >> > index 3df542c84610..45a4e77a47c4 100644 >> > --- a/tools/testing/selftests/net/Makefile >> > +++ b/tools/testing/selftests/net/Makefile >> > @@ -6,8 +6,8 @@ CFLAGS += -I../../../../usr/include/ >> > TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh >> > rtnetlink.sh >> > TEST_GEN_FILES = socket >> > TEST_GEN_FILES += psock_fanout psock_tpacket >> > -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa >> > -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict >> > +TEST_GEN_PROGS += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa >> > +TEST_GEN_PROGS += reuseport_dualstack msg_zerocopy reuseaddr_conflict >> >> Hmm. I see msg_zerocopy.sh for running msg_zerocopy. msg_zerocopy should >> still stay in TEST_GEN_FILES and msg_zerocopy.sh needs to be added to >> TEST_PROGS so it runs. >> > > Actually the shell script requires arguments, it doesn't just run the test. > I'll fix this to just omit the test for now as it's not setup to run properly. > > Willem, could you follow up with a patch so that the zero copy test is run > properly the way you envision it running? 
You need to make sure that > > make -C tools/testing/selftests TARGETS=net run_tests > > actually runs your zero copy test the way you expect it to, otherwise it's > just > sitting there collecting dust. Thanks, Will do. In its current state, this test is really only meant to be run manually. It demonstrates the API and outputs some information on stderr. Zerocopy itself requires a two-host test. The feature is expressly disabled over loopback. But I can make this a pass/fail tests that exercises the interface and notification channel and verifies that data was copied. It will be a bit more work than just changing the default invocation of msg_zerocopy.sh
Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c
On Tue, Sep 19, 2017 at 4:04 AM, Oleksandr Natalenkowrote: > Hi. > > 18.09.2017 23:40, Yuchung Cheng wrote: >> >> I assume this kernel does not have the patch that Neal proposed in his >> first reply? > > > Correct. > >> The main warning needs to be triggered by another peculiar SACK that >> kicks the sender into recovery again (after undo). Please let it run >> longer if possible to see if we can get both. But the new data does >> indicate the we can (validly) be in CA_Open with retrans_out > 0. > > > OK, here it is: > > === > » LC_TIME=C jctl -kb | grep RIP > … > Sep 19 12:54:03 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > Sep 19 12:54:22 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > Sep 19 12:54:25 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > Sep 19 12:56:00 defiant kernel: RIP: 0010:tcp_fastretrans_alert+0x7c8/0x990 > Sep 19 12:57:07 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > Sep 19 12:57:14 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > Sep 19 12:58:04 defiant kernel: RIP: 0010:tcp_undo_cwnd_reduction+0xbd/0xd0 > … > === > > Note timestamps — two types of warning are distant in time, so didn't happen > at once. > > While still running this kernel, anything else I can check for you? Thanks. Based on all the experiments you did I believe there's other code path than my hypothesis that'd cause the warning: 1) Neal's proposed F-RTO fix didn't work 2) the main warning is not being triggered together with the newly-instrumented warning in undo 3) Disabling RACK stopped the warning We couldn't figure out exactly what. So we'll do a bit code auditing first to find more suspects
Re: [PATCH v2 net-next] net: sk_buff rbnode reorg
On Tue, Sep 19, 2017 at 8:14 AM, Eric Dumazet wrote: > From: Eric Dumazet > > skb->rbnode shares space with skb->next, skb->prev and skb->tstamp > > Current uses (TCP receive ofo queue and netem) need to save/restore > tstamp, while skb->dev is either NULL (TCP) or a constant for a given > queue (netem). > > Since we plan using an RB tree for TCP retransmit queue to speedup SACK > processing with large BDP, this patch exchanges skb->dev and > skb->tstamp. > > This saves some overhead in both TCP and netem. > > v2: removes the swtstamp field from struct tcp_skb_cb > > Signed-off-by: Eric Dumazet > Cc: Soheil Hassas Yeganeh > Cc: Wei Wang > Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Very nice!
[patch net-next] team: fall back to hash if table entry is empty
If the hash to port mapping table does not have a valid port (i.e. when a port goes down), fall back to the simple hashing mechanism to avoid dropping packets. Signed-off-by: Jim Hanko Acked-by: Jiri Pirko --- drivers/net/team/team_mode_loadbalance.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index 1468ddf..a5ef970 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -137,7 +137,13 @@ static struct team_port *lb_htpm_select_tx_port(struct team *team, struct sk_buff *skb, unsigned char hash) { - return rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash)); + struct team_port *port; + + port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash)); + if (likely(port)) + return port; + /* If no valid port in the table, fall back to simple hash */ + return lb_hash_select_tx_port(team, lb_priv, skb, hash); } struct lb_select_tx_port { -- 2.7.4
[PATCH] isdn/i4l: check the message proto does not change across fetches
In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched twice from userspace. The first fetch is used to peek at the protocol of the message and reset the huptimer if necessary; while the second fetch copies in the whole buffer. However, given that buf resides in userspace memory, a user process can race to change its memory content across fetches. By doing so, we can either avoid resetting the huptimer for any type of packets (by first setting proto to PPP_LCP and later change to the actual type) or force resetting the huptimer for LCP packets. This patch does a memcmp between the two fetches and abort if changes to the protobuf is detected across fetches. Signed-off-by: Meng Xu--- drivers/isdn/i4l/isdn_ppp.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 6c44609..21a9ae8 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -857,6 +857,7 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) (lp->flags & ISDN_NET_CONNECTED)) { unsigned short hl; struct sk_buff *skb; + void *skb_tail; /* * we need to reserve enough space in front of * sk_buff. old call to dev_alloc_skb only reserved @@ -869,11 +870,21 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) return count; } skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, count), buf, count)) + skb_tail = skb_put(skb, count); + if (copy_from_user(skb_tail, buf, count)) { kfree_skb(skb); return -EFAULT; } + + /* +* abort if the message proto is changed between the fetches +*/ + if (memcmp(skb_tail, protobuf, 4)) { + kfree_skb(skb); + return -EFAULT; + } + if (is->debug & 0x40) { printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); -- 2.7.4
Re: [5/5] e1000e: Avoid receiver overrun interrupt bursts
Hi. We’ve been running this patchset (all 5) for about as long as they’ve been under review… about 2 months. And in a burn-in lab with heavy traffic. We’ve not seen a single link-flap in hundreds of hours of saturated traffic. Would love to see some resolution soon on this as we don’t want to ship a release with unsanctioned patches. Is there an estimate on when that might be? Thanks, -Philip > On Jul 21, 2017, at 12:36 PM, Benjamin Poirier wrote: > > When e1000e_poll() is not fast enough to keep up with incoming traffic, the > adapter (when operating in msix mode) raises the Other interrupt to signal > Receiver Overrun. > > This is a double problem because 1) at the moment e1000_msix_other() > assumes that it is only called in case of Link Status Change and 2) if the > condition persists, the interrupt is repeatedly raised again in quick > succession. > > Ideally we would configure the Other interrupt to not be raised in case of > receiver overrun but this doesn't seem possible on this adapter. Instead, > we handle the first part of the problem by reverting to the practice of > reading ICR in the other interrupt handler, like before commit 16ecba59bc33 > ("e1000e: Do not read ICR in Other interrupt"). Thanks to commit > 0a8047ac68e5 ("e1000e: Fix msi-x interrupt automask") which cleared IAME > from CTRL_EXT, reading ICR doesn't interfere with RxQ0, TxQ0 interrupts > anymore. We handle the second part of the problem by not re-enabling the > Other interrupt right away when there is overrun. Instead, we wait until > traffic subsides, napi polling mode is exited and interrupts are > re-enabled. 
> > Reported-by: Lennart Sorensen > Fixes: 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt") > Signed-off-by: Benjamin Poirier > Tested-by: Aaron Brown > --- > drivers/net/ethernet/intel/e1000e/defines.h | 1 + > drivers/net/ethernet/intel/e1000e/netdev.c | 33 +++-- > 2 files changed, 27 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/intel/e1000e/defines.h > b/drivers/net/ethernet/intel/e1000e/defines.h > index 0641c0098738..afb7ebe20b24 100644 > --- a/drivers/net/ethernet/intel/e1000e/defines.h > +++ b/drivers/net/ethernet/intel/e1000e/defines.h > @@ -398,6 +398,7 @@ > #define E1000_ICR_LSC 0x0004 /* Link Status Change */ > #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ > #define E1000_ICR_RXDMT0 0x0010 /* Rx desc min. threshold (0) */ > +#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ > #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ > #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ > /* If this bit asserted, the driver should claim the interrupt */ > diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c > b/drivers/net/ethernet/intel/e1000e/netdev.c > index 5a8ab1136566..803edd1a6401 100644 > --- a/drivers/net/ethernet/intel/e1000e/netdev.c > +++ b/drivers/net/ethernet/intel/e1000e/netdev.c > @@ -1910,12 +1910,30 @@ static irqreturn_t e1000_msix_other(int > __always_unused irq, void *data) > struct net_device *netdev = data; > struct e1000_adapter *adapter = netdev_priv(netdev); > struct e1000_hw *hw = &adapter->hw; > + u32 icr; > + bool enable = true; > + > + icr = er32(ICR); > + if (icr & E1000_ICR_RXO) { > + ew32(ICR, E1000_ICR_RXO); > + enable = false; > + /* napi poll will re-enable Other, make sure it runs */ > + if (napi_schedule_prep(&adapter->napi)) { > + adapter->total_rx_bytes = 0; > + adapter->total_rx_packets = 0; > + __napi_schedule(&adapter->napi); > + } > + } > + if (icr & E1000_ICR_LSC) { > + ew32(ICR, E1000_ICR_LSC); > + hw->mac.get_link_status = true; > + /* guard against interrupt when we're going down 
*/ > + if (!test_bit(__E1000_DOWN, &adapter->state)) { > + mod_timer(&adapter->watchdog_timer, jiffies + 1); > + } > + } > > - hw->mac.get_link_status = true; > - > - /* guard against interrupt when we're going down */ > - if (!test_bit(__E1000_DOWN, &adapter->state)) { > - mod_timer(&adapter->watchdog_timer, jiffies + 1); > + if (enable && !test_bit(__E1000_DOWN, &adapter->state)) { > ew32(IMS, E1000_IMS_OTHER); > } > > @@ -2687,7 +2705,8 @@ static int e1000e_poll(struct napi_struct *napi, int > weight) > napi_complete_done(napi, work_done); > if (!test_bit(__E1000_DOWN, &adapter->state)) { > if (adapter->msix_entries) > - ew32(IMS, adapter->rx_ring->ims_val); > + ew32(IMS, adapter->rx_ring->ims_val | > + E1000_IMS_OTHER); > else >