date:20161018

[RFC 2/6] qed: Add iSCSI out of order packet handling.

2016-10-18 Thread manish.rangankar

From: Yuval Mintz 

This patch adds out of order packet handling for hardware offloaded
iSCSI. Out of order packet handling requires driver buffer allocation
and assistance.

Signed-off-by: Arun Easi 
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/qed/Makefile   |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h  |   1 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |  14 +-
 drivers/net/ethernet/qlogic/qed/qed_ll2.c  | 559 +++--
 drivers/net/ethernet/qlogic/qed/qed_ll2.h  |   9 +
 drivers/net/ethernet/qlogic/qed/qed_ooo.c  | 510 ++
 drivers/net/ethernet/qlogic/qed/qed_ooo.h  | 116 ++
 drivers/net/ethernet/qlogic/qed/qed_roce.c |   1 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c  |   9 +
 9 files changed, 1195 insertions(+), 26 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.h

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile 
b/drivers/net/ethernet/qlogic/qed/Makefile
index b76669c..9121bf0 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -6,4 +6,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o 
qed_init_ops.o \
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o
-qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o
+qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o qed_ooo.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h 
b/drivers/net/ethernet/qlogic/qed/qed.h
index a61b1c0..e5626ae 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -380,6 +380,7 @@ struct qed_hwfn {
/* Protocol related */
boolusing_ll2;
struct qed_ll2_info *p_ll2_info;
+   struct qed_ooo_info *p_ooo_info;
struct qed_rdma_info*p_rdma_info;
struct qed_iscsi_info   *p_iscsi_info;
struct qed_pf_paramspf_params;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c 
b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index a4234c0..060e9a4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -32,6 +32,7 @@
 #include "qed_iscsi.h"
 #include "qed_ll2.h"
 #include "qed_mcp.h"
+#include "qed_ooo.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 #include "qed_sriov.h"
@@ -157,8 +158,10 @@ void qed_resc_free(struct qed_dev *cdev)
qed_ll2_free(p_hwfn, p_hwfn->p_ll2_info);
 #endif
if (IS_ENABLED(CONFIG_QEDI) &&
-   p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+   p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
qed_iscsi_free(p_hwfn, p_hwfn->p_iscsi_info);
+   qed_ooo_free(p_hwfn, p_hwfn->p_ooo_info);
+   }
qed_iov_free(p_hwfn);
qed_dmae_info_free(p_hwfn);
qed_dcbx_info_free(p_hwfn, p_hwfn->p_dcbx_info);
@@ -416,6 +419,7 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt 
*p_ptt)
 int qed_resc_alloc(struct qed_dev *cdev)
 {
struct qed_iscsi_info *p_iscsi_info;
+   struct qed_ooo_info *p_ooo_info;
 #ifdef CONFIG_QED_LL2
struct qed_ll2_info *p_ll2_info;
 #endif
@@ -543,6 +547,10 @@ int qed_resc_alloc(struct qed_dev *cdev)
if (!p_iscsi_info)
goto alloc_no_mem;
p_hwfn->p_iscsi_info = p_iscsi_info;
+   p_ooo_info = qed_ooo_alloc(p_hwfn);
+   if (!p_ooo_info)
+   goto alloc_no_mem;
+   p_hwfn->p_ooo_info = p_ooo_info;
}
 
/* DMA info initialization */
@@ -598,8 +606,10 @@ void qed_resc_setup(struct qed_dev *cdev)
qed_ll2_setup(p_hwfn, p_hwfn->p_ll2_info);
 #endif
if (IS_ENABLED(CONFIG_QEDI) &&
-   p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+   p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
qed_iscsi_setup(p_hwfn, p_hwfn->p_iscsi_info);
+   qed_ooo_setup(p_hwfn, p_hwfn->p_ooo_info);
+   }
}
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c 
b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index e67f3c9..4ce12e9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -36,6 +36,7 @@
 #include "qed_int.h"
 #include "qed_ll2.h"
 #include "qed_mcp.h"
+#include "qed_ooo.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
 
@@ -295,27 +296,36 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 
connection_handle)

[RFC 5/6] qedi: Add support for iSCSI session management.

2016-10-18 Thread manish.rangankar

From: Manish Rangankar 

This patch adds support for iscsi_transport LLD Login,
Logout, NOP-IN/NOP-OUT, Async, Reject PDU processing
and Firmware async event handling support.

Signed-off-by: Nilesh Javali 
Signed-off-by: Adheer Chandravanshi 
Signed-off-by: Chad Dupuis 
Signed-off-by: Saurav Kashyap 
Signed-off-by: Arun Easi 
Signed-off-by: Manish Rangankar 
---
 drivers/scsi/qedi/qedi_fw.c| 1123 
 drivers/scsi/qedi/qedi_gbl.h   |   67 ++
 drivers/scsi/qedi/qedi_iscsi.c | 1604 
 drivers/scsi/qedi/qedi_iscsi.h |  228 ++
 drivers/scsi/qedi/qedi_main.c  |  164 
 5 files changed, 3186 insertions(+)
 create mode 100644 drivers/scsi/qedi/qedi_fw.c
 create mode 100644 drivers/scsi/qedi/qedi_gbl.h
 create mode 100644 drivers/scsi/qedi/qedi_iscsi.c
 create mode 100644 drivers/scsi/qedi/qedi_iscsi.h

diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c
new file mode 100644
index 000..a820785
--- /dev/null
+++ b/drivers/scsi/qedi/qedi_fw.c
@@ -0,0 +1,1123 @@
+/*
+ * QLogic iSCSI Offload Driver
+ * Copyright (c) 2016 Cavium Inc.
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include 
+#include 
+#include 
+
+#include "qedi.h"
+#include "qedi_iscsi.h"
+#include "qedi_gbl.h"
+
+static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn,
+  struct iscsi_task *mtask);
+
+void qedi_iscsi_unmap_sg_list(struct qedi_cmd *cmd)
+{
+   struct scsi_cmnd *sc = cmd->scsi_cmd;
+
+   if (cmd->io_tbl.sge_valid && sc) {
+   scsi_dma_unmap(sc);
+   cmd->io_tbl.sge_valid = 0;
+   }
+}
+
+static void qedi_process_logout_resp(struct qedi_ctx *qedi,
+union iscsi_cqe *cqe,
+struct iscsi_task *task,
+struct qedi_conn *qedi_conn)
+{
+   struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data;
+   struct iscsi_logout_rsp *resp_hdr;
+   struct iscsi_session *session = conn->session;
+   struct iscsi_logout_response_hdr *cqe_logout_response;
+   struct qedi_cmd *cmd;
+
+   cmd = (struct qedi_cmd *)task->dd_data;
+   cqe_logout_response = >cqe_common.iscsi_hdr.logout_response;
+   spin_lock(>back_lock);
+   resp_hdr = (struct iscsi_logout_rsp *)_conn->gen_pdu.resp_hdr;
+   memset(resp_hdr, 0, sizeof(struct iscsi_hdr));
+   resp_hdr->opcode = cqe_logout_response->opcode;
+   resp_hdr->flags = cqe_logout_response->flags;
+   resp_hdr->hlength = 0;
+
+   resp_hdr->itt = build_itt(cqe->cqe_solicited.itid, conn->session->age);
+   resp_hdr->statsn = cpu_to_be32(cqe_logout_response->stat_sn);
+   resp_hdr->exp_cmdsn = cpu_to_be32(cqe_logout_response->exp_cmd_sn);
+   resp_hdr->max_cmdsn = cpu_to_be32(cqe_logout_response->max_cmd_sn);
+
+   resp_hdr->t2wait = cpu_to_be32(cqe_logout_response->time2wait);
+   resp_hdr->t2retain = cpu_to_be32(cqe_logout_response->time2retain);
+
+   QEDI_INFO(>dbg_ctx, QEDI_LOG_TID,
+ "Freeing tid=0x%x for cid=0x%x\n",
+ cmd->task_id, qedi_conn->iscsi_conn_id);
+
+   if (likely(cmd->io_cmd_in_list)) {
+   cmd->io_cmd_in_list = false;
+   list_del_init(>io_cmd);
+   qedi_conn->active_cmd_count--;
+   } else {
+   QEDI_INFO(>dbg_ctx, QEDI_LOG_INFO,
+ "Active cmd list node already deleted, tid=0x%x, 
cid=0x%x, io_cmd_node=%p\n",
+ cmd->task_id, qedi_conn->iscsi_conn_id,
+ >io_cmd);
+   }
+
+   cmd->state = RESPONSE_RECEIVED;
+   qedi_clear_task_idx(qedi, cmd->task_id);
+   __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr, NULL, 0);
+
+   spin_unlock(>back_lock);
+}
+
+static void qedi_process_text_resp(struct qedi_ctx *qedi,
+  union iscsi_cqe *cqe,
+  struct iscsi_task *task,
+  struct qedi_conn *qedi_conn)
+{
+   struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data;
+   struct iscsi_session *session = conn->session;
+   struct iscsi_task_context *task_ctx;
+   struct iscsi_text_rsp *resp_hdr_ptr;
+   struct iscsi_text_response_hdr *cqe_text_response;
+   struct qedi_cmd *cmd;
+   int pld_len;
+   u32 *tmp;
+
+   cmd = (struct qedi_cmd *)task->dd_data;
+   task_ctx = (struct iscsi_task_context *)qedi_get_task_mem(>tasks,
+ cmd->task_id);
+
+   cqe_text_response

[RFC 3/6] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

2016-10-18 Thread manish.rangankar

From: Manish Rangankar 

The QLogic FastLinQ Driver for iSCSI (qedi) is the iSCSI specific module
for 41000 Series Converged Network Adapters by QLogic.

This patch consists of the following changes:
  - MAINTAINERS, Makefile and Kconfig changes for qedi,
  - PCI driver registration,
  - iSCSI host level initialization,
  - Debugfs and log level infrastructure.

Signed-off-by: Nilesh Javali 
Signed-off-by: Adheer Chandravanshi 
Signed-off-by: Chad Dupuis 
Signed-off-by: Saurav Kashyap 
Signed-off-by: Arun Easi 
Signed-off-by: Manish Rangankar 
---
 MAINTAINERS |6 +
 drivers/net/ethernet/qlogic/Kconfig |   12 -
 drivers/scsi/Kconfig|1 +
 drivers/scsi/Makefile   |1 +
 drivers/scsi/qedi/Kconfig   |   10 +
 drivers/scsi/qedi/Makefile  |5 +
 drivers/scsi/qedi/qedi.h|  286 +++
 drivers/scsi/qedi/qedi_dbg.c|  143 
 drivers/scsi/qedi/qedi_dbg.h|  144 
 drivers/scsi/qedi/qedi_debugfs.c|  244 ++
 drivers/scsi/qedi/qedi_hsi.h|   52 ++
 drivers/scsi/qedi/qedi_main.c   | 1550 +++
 drivers/scsi/qedi/qedi_sysfs.c  |   52 ++
 drivers/scsi/qedi/qedi_version.h|   14 +
 14 files changed, 2508 insertions(+), 12 deletions(-)
 create mode 100644 drivers/scsi/qedi/Kconfig
 create mode 100644 drivers/scsi/qedi/Makefile
 create mode 100644 drivers/scsi/qedi/qedi.h
 create mode 100644 drivers/scsi/qedi/qedi_dbg.c
 create mode 100644 drivers/scsi/qedi/qedi_dbg.h
 create mode 100644 drivers/scsi/qedi/qedi_debugfs.c
 create mode 100644 drivers/scsi/qedi/qedi_hsi.h
 create mode 100644 drivers/scsi/qedi/qedi_main.c
 create mode 100644 drivers/scsi/qedi/qedi_sysfs.c
 create mode 100644 drivers/scsi/qedi/qedi_version.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 5e925a2..906d05f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9909,6 +9909,12 @@ F:   drivers/net/ethernet/qlogic/qed/
 F: include/linux/qed/
 F: drivers/net/ethernet/qlogic/qede/
 
+QLOGIC QL41xxx ISCSI DRIVER
+M: qlogic-storage-upstr...@cavium.com
+L: linux-s...@vger.kernel.org
+S: Supported
+F: drivers/scsi/qedi/
+
 QNX4 FILESYSTEM
 M: Anders Larsen 
 W: http://www.alarsen.net/linux/qnx4fs/
diff --git a/drivers/net/ethernet/qlogic/Kconfig 
b/drivers/net/ethernet/qlogic/Kconfig
index bad4fae..28b4366 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -121,16 +121,4 @@ config INFINIBAND_QEDR
 config QED_ISCSI
bool
 
-config QEDI
-   tristate "QLogic QED 25/40/100Gb iSCSI driver"
-   depends on QED
-   select QED_LL2
-   select QED_ISCSI
-   default n
-   ---help---
- This provides a temporary node that allows the compilation
- and logical testing of the hardware offload iSCSI support
- for QLogic QED. This would be replaced by the 'real' option
- once the QEDI driver is added [+relocated].
-
 endif # NET_VENDOR_QLOGIC
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e2bdb9..5cf03db 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1254,6 +1254,7 @@ config SCSI_QLOGICPTI
 
 source "drivers/scsi/qla2xxx/Kconfig"
 source "drivers/scsi/qla4xxx/Kconfig"
+source "drivers/scsi/qedi/Kconfig"
 
 config SCSI_LPFC
tristate "Emulex LightPulse Fibre Channel Support"
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 38d938d..da9e312 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -132,6 +132,7 @@ obj-$(CONFIG_PS3_ROM)   += ps3rom.o
 obj-$(CONFIG_SCSI_CXGB3_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/
 obj-$(CONFIG_SCSI_CXGB4_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/
 obj-$(CONFIG_SCSI_BNX2_ISCSI)  += libiscsi.o bnx2i/
+obj-$(CONFIG_QEDI)  += libiscsi.o qedi/
 obj-$(CONFIG_BE2ISCSI) += libiscsi.o be2iscsi/
 obj-$(CONFIG_SCSI_ESAS2R)  += esas2r/
 obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o
diff --git a/drivers/scsi/qedi/Kconfig b/drivers/scsi/qedi/Kconfig
new file mode 100644
index 000..23ca8a2
--- /dev/null
+++ b/drivers/scsi/qedi/Kconfig
@@ -0,0 +1,10 @@
+config QEDI
+   tristate "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver Support"
+   depends on PCI && SCSI
+   depends on QED
+   select SCSI_ISCSI_ATTRS
+   select QED_LL2
+   select QED_ISCSI
+   ---help---
+   This driver supports iSCSI offload for the QLogic FastLinQ
+   41000 Series Converged Network Adapters.
diff --git a/drivers/scsi/qedi/Makefile b/drivers/scsi/qedi/Makefile
new file mode 100644
index 000..2b3e16b
--- /dev/null
+++ b/drivers/scsi/qedi/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_QEDI) := qedi.o
+qedi-y := qedi_main.o qedi_iscsi.o qedi_fw.o qedi_sysfs.o \

[RFC 6/6] qedi: Add support for data path.

2016-10-18 Thread manish.rangankar

From: Manish Rangankar 

This patch adds support for data path and TMF handling.

Signed-off-by: Nilesh Javali 
Signed-off-by: Adheer Chandravanshi 
Signed-off-by: Chad Dupuis 
Signed-off-by: Saurav Kashyap 
Signed-off-by: Arun Easi 
Signed-off-by: Manish Rangankar 
---
 drivers/scsi/qedi/qedi_fw.c| 1282 
 drivers/scsi/qedi/qedi_gbl.h   |6 +
 drivers/scsi/qedi/qedi_iscsi.c |6 +
 drivers/scsi/qedi/qedi_main.c  |4 +
 4 files changed, 1298 insertions(+)

diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c
index a820785..af1e14d 100644
--- a/drivers/scsi/qedi/qedi_fw.c
+++ b/drivers/scsi/qedi/qedi_fw.c
@@ -147,6 +147,114 @@ static void qedi_process_text_resp(struct qedi_ctx *qedi,
spin_unlock(>back_lock);
 }
 
+static void qedi_tmf_resp_work(struct work_struct *work)
+{
+   struct qedi_cmd *qedi_cmd =
+   container_of(work, struct qedi_cmd, tmf_work);
+   struct qedi_conn *qedi_conn = qedi_cmd->conn;
+   struct qedi_ctx *qedi = qedi_conn->qedi;
+   struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data;
+   struct iscsi_session *session = conn->session;
+   struct iscsi_tm_rsp *resp_hdr_ptr;
+   struct iscsi_cls_session *cls_sess;
+   int rval = 0;
+
+   set_bit(QEDI_CONN_FW_CLEANUP, _conn->flags);
+   resp_hdr_ptr =  (struct iscsi_tm_rsp *)qedi_cmd->tmf_resp_buf;
+   cls_sess = iscsi_conn_to_session(qedi_conn->cls_conn);
+
+   iscsi_block_session(session->cls_session);
+   rval = qedi_cleanup_all_io(qedi, qedi_conn, qedi_cmd->task, true);
+   if (rval) {
+   clear_bit(QEDI_CONN_FW_CLEANUP, _conn->flags);
+   qedi_clear_task_idx(qedi, qedi_cmd->task_id);
+   iscsi_unblock_session(session->cls_session);
+   return;
+   }
+
+   iscsi_unblock_session(session->cls_session);
+   qedi_clear_task_idx(qedi, qedi_cmd->task_id);
+
+   spin_lock(>back_lock);
+   __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0);
+   spin_unlock(>back_lock);
+   kfree(resp_hdr_ptr);
+   clear_bit(QEDI_CONN_FW_CLEANUP, _conn->flags);
+}
+
+static void qedi_process_tmf_resp(struct qedi_ctx *qedi,
+ union iscsi_cqe *cqe,
+ struct iscsi_task *task,
+ struct qedi_conn *qedi_conn)
+
+{
+   struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data;
+   struct iscsi_session *session = conn->session;
+   struct iscsi_tmf_response_hdr *cqe_tmp_response;
+   struct iscsi_tm_rsp *resp_hdr_ptr;
+   struct iscsi_tm *tmf_hdr;
+   struct qedi_cmd *qedi_cmd = NULL;
+   u32 *tmp;
+
+   cqe_tmp_response = >cqe_common.iscsi_hdr.tmf_response;
+
+   qedi_cmd = task->dd_data;
+   qedi_cmd->tmf_resp_buf = kzalloc(sizeof(*resp_hdr_ptr), GFP_KERNEL);
+   if (!qedi_cmd->tmf_resp_buf) {
+   QEDI_ERR(>dbg_ctx,
+"Failed to allocate resp buf, cid=0x%x\n",
+ qedi_conn->iscsi_conn_id);
+   return;
+   }
+
+   spin_lock(>back_lock);
+   resp_hdr_ptr =  (struct iscsi_tm_rsp *)qedi_cmd->tmf_resp_buf;
+   memset(resp_hdr_ptr, 0, sizeof(struct iscsi_tm_rsp));
+
+   /* Fill up the header */
+   resp_hdr_ptr->opcode = cqe_tmp_response->opcode;
+   resp_hdr_ptr->flags = cqe_tmp_response->hdr_flags;
+   resp_hdr_ptr->response = cqe_tmp_response->hdr_response;
+   resp_hdr_ptr->hlength = 0;
+
+   hton24(resp_hdr_ptr->dlength,
+  (cqe_tmp_response->hdr_second_dword &
+   ISCSI_TMF_RESPONSE_HDR_DATA_SEG_LEN_MASK));
+   tmp = (u32 *)resp_hdr_ptr->dlength;
+   resp_hdr_ptr->itt = build_itt(cqe->cqe_solicited.itid,
+ conn->session->age);
+   resp_hdr_ptr->statsn = cpu_to_be32(cqe_tmp_response->stat_sn);
+   resp_hdr_ptr->exp_cmdsn  = cpu_to_be32(cqe_tmp_response->exp_cmd_sn);
+   resp_hdr_ptr->max_cmdsn = cpu_to_be32(cqe_tmp_response->max_cmd_sn);
+
+   tmf_hdr = (struct iscsi_tm *)qedi_cmd->task->hdr;
+
+   if (likely(qedi_cmd->io_cmd_in_list)) {
+   qedi_cmd->io_cmd_in_list = false;
+   list_del_init(_cmd->io_cmd);
+   qedi_conn->active_cmd_count--;
+   }
+
+   if (((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) ==
+ ISCSI_TM_FUNC_LOGICAL_UNIT_RESET) ||
+   ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) ==
+ ISCSI_TM_FUNC_TARGET_WARM_RESET) ||
+   ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) ==
+ ISCSI_TM_FUNC_TARGET_COLD_RESET)) {
+   INIT_WORK(_cmd->tmf_work, qedi_tmf_resp_work);
+

[RFC 1/6] qed: Add support for hardware offloaded iSCSI.

2016-10-18 Thread manish.rangankar

From: Yuval Mintz 

This adds the backbone required for the various HW initializations
which are necessary for the iSCSI driver (qedi) for QLogic FastLinQ
4xxxx line of adapters - FW notification, resource initializations, etc.

Signed-off-by: Arun Easi 
Signed-off-by: Yuval Mintz 
---
 drivers/net/ethernet/qlogic/Kconfig|   15 +
 drivers/net/ethernet/qlogic/qed/Makefile   |1 +
 drivers/net/ethernet/qlogic/qed/qed.h  |8 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |   15 +
 drivers/net/ethernet/qlogic/qed/qed_int.h  |1 -
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c| 1310 
 drivers/net/ethernet/qlogic/qed/qed_iscsi.h|   52 +
 drivers/net/ethernet/qlogic/qed/qed_l2.c   |1 -
 drivers/net/ethernet/qlogic/qed/qed_ll2.c  |   35 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c |2 -
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  |6 -
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |2 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c  |   15 +
 include/linux/qed/qed_if.h |2 +
 include/linux/qed/qed_iscsi_if.h   |  249 +
 15 files changed, 1692 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.h
 create mode 100644 include/linux/qed/qed_iscsi_if.h

diff --git a/drivers/net/ethernet/qlogic/Kconfig 
b/drivers/net/ethernet/qlogic/Kconfig
index 0df1391f9..bad4fae 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -118,4 +118,19 @@ config INFINIBAND_QEDR
  for QLogic QED. This would be replaced by the 'real' option
  once the QEDR driver is added [+relocated].
 
+config QED_ISCSI
+   bool
+
+config QEDI
+   tristate "QLogic QED 25/40/100Gb iSCSI driver"
+   depends on QED
+   select QED_LL2
+   select QED_ISCSI
+   default n
+   ---help---
+ This provides a temporary node that allows the compilation
+ and logical testing of the hardware offload iSCSI support
+ for QLogic QED. This would be replaced by the 'real' option
+ once the QEDI driver is added [+relocated].
+
 endif # NET_VENDOR_QLOGIC
diff --git a/drivers/net/ethernet/qlogic/qed/Makefile 
b/drivers/net/ethernet/qlogic/qed/Makefile
index cda0af7..b76669c 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -6,3 +6,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o 
qed_init_ops.o \
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o
+qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h 
b/drivers/net/ethernet/qlogic/qed/qed.h
index 653bb57..a61b1c0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -35,6 +35,7 @@
 
 #define QED_WFQ_UNIT   100
 
+#define ISCSI_BDQ_ID(_port_id) (_port_id)
 #define QED_WID_SIZE(1024)
 #define QED_PF_DEMS_SIZE(4)
 
@@ -167,6 +168,7 @@ enum QED_RESOURCES {
QED_ILT,
QED_LL2_QUEUE,
QED_RDMA_STATS_QUEUE,
+   QED_CMDQS_CQS,
QED_MAX_RESC,
 };
 
@@ -379,6 +381,7 @@ struct qed_hwfn {
boolusing_ll2;
struct qed_ll2_info *p_ll2_info;
struct qed_rdma_info*p_rdma_info;
+   struct qed_iscsi_info   *p_iscsi_info;
struct qed_pf_paramspf_params;
 
bool b_rdma_enabled_in_prs;
@@ -578,6 +581,8 @@ struct qed_dev {
/* Linux specific here */
struct  qede_dev*edev;
struct  pci_dev *pdev;
+   u32 flags;
+#define QED_FLAG_STORAGE_STARTED   (BIT(0))
int msg_enable;
 
struct pci_params   pci_params;
@@ -591,6 +596,7 @@ struct qed_dev {
union {
struct qed_common_cb_ops*common;
struct qed_eth_cb_ops   *eth;
+   struct qed_iscsi_cb_ops *iscsi;
} protocol_ops;
void*ops_cookie;
 
@@ -600,7 +606,7 @@ struct qed_dev {
struct qed_cb_ll2_info  *ll2;
u8  ll2_mac_address[ETH_ALEN];
 #endif
-
+   DECLARE_HASHTABLE(connections, 10);
const struct firmware   *firmware;
 
u32 rdma_max_sge;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c 
b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 754f6a9..a4234c0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -29,6 +29,7 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_int.h"
+#include "qed_iscsi.h"
 #include "qed_ll2.h"

[RFC 0/6] Add QLogic FastLinQ iSCSI (qedi) driver.

2016-10-18 Thread manish.rangankar

From: Manish Rangankar 

This series introduces hardware offload iSCSI initiator driver for the
41000 Series Converged Network Adapters (579xx chip) by QLogic. The overall
driver design includes a common module ('qed') and protocol specific
dependent modules ('qedi' for iSCSI).

This is an open iSCSI driver; modifications to open iSCSI user components
'iscsid', 'iscsiuio', etc. are required for the solution to work. The user
space changes are also in the process of being submitted.

https://groups.google.com/forum/#!forum/open-iscsi

The 'qed' common module, under drivers/net/ethernet/qlogic/qed/, is
enhanced with functionality required for the iSCSI support. This series
is based on:

net-next: 1b830996c1603225a96e233c3b09bf2b12607d78

The qedi patches are divided logically for review purposes, and individual
patches do not compile on their own.

We really appreciate any review comments you may have on the patch series.

Manish Rangankar (4):
  qedi: Add QLogic FastLinQ offload iSCSI driver framework.
  qedi: Add LL2 iSCSI interface for offload iSCSI.
  qedi: Add support for iSCSI session management.
  qedi: Add support for data path.

Yuval Mintz (2):
  qed: Add support for hardware offloaded iSCSI.
  qed: Add iSCSI out of order packet handling.

 MAINTAINERS|6 +
 drivers/net/ethernet/qlogic/Kconfig|3 +
 drivers/net/ethernet/qlogic/qed/Makefile   |1 +
 drivers/net/ethernet/qlogic/qed/qed.h  |9 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |   25 +
 drivers/net/ethernet/qlogic/qed/qed_int.h  |1 -
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c| 1310 +
 drivers/net/ethernet/qlogic/qed/qed_iscsi.h|   52 +
 drivers/net/ethernet/qlogic/qed/qed_l2.c   |1 -
 drivers/net/ethernet/qlogic/qed/qed_ll2.c  |  594 +-
 drivers/net/ethernet/qlogic/qed/qed_ll2.h  |9 +
 drivers/net/ethernet/qlogic/qed/qed_main.c |2 -
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  |6 -
 drivers/net/ethernet/qlogic/qed/qed_ooo.c  |  510 +
 drivers/net/ethernet/qlogic/qed/qed_ooo.h  |  116 ++
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |2 +
 drivers/net/ethernet/qlogic/qed/qed_roce.c |1 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c  |   24 +
 drivers/scsi/Kconfig   |1 +
 drivers/scsi/Makefile  |1 +
 drivers/scsi/qedi/Kconfig  |   10 +
 drivers/scsi/qedi/Makefile |5 +
 drivers/scsi/qedi/qedi.h   |  359 
 drivers/scsi/qedi/qedi_dbg.c   |  143 ++
 drivers/scsi/qedi/qedi_dbg.h   |  144 ++
 drivers/scsi/qedi/qedi_debugfs.c   |  244 +++
 drivers/scsi/qedi/qedi_fw.c| 2405 
 drivers/scsi/qedi/qedi_gbl.h   |   73 +
 drivers/scsi/qedi/qedi_hsi.h   |   52 +
 drivers/scsi/qedi/qedi_iscsi.c | 1610 
 drivers/scsi/qedi/qedi_iscsi.h |  228 +++
 drivers/scsi/qedi/qedi_main.c  | 2075 
 drivers/scsi/qedi/qedi_sysfs.c |   52 +
 drivers/scsi/qedi/qedi_version.h   |   14 +
 include/linux/qed/qed_if.h |2 +
 include/linux/qed/qed_iscsi_if.h   |  249 +++
 36 files changed, 10294 insertions(+), 45 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.h
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.h
 create mode 100644 drivers/scsi/qedi/Kconfig
 create mode 100644 drivers/scsi/qedi/Makefile
 create mode 100644 drivers/scsi/qedi/qedi.h
 create mode 100644 drivers/scsi/qedi/qedi_dbg.c
 create mode 100644 drivers/scsi/qedi/qedi_dbg.h
 create mode 100644 drivers/scsi/qedi/qedi_debugfs.c
 create mode 100644 drivers/scsi/qedi/qedi_fw.c
 create mode 100644 drivers/scsi/qedi/qedi_gbl.h
 create mode 100644 drivers/scsi/qedi/qedi_hsi.h
 create mode 100644 drivers/scsi/qedi/qedi_iscsi.c
 create mode 100644 drivers/scsi/qedi/qedi_iscsi.h
 create mode 100644 drivers/scsi/qedi/qedi_main.c
 create mode 100644 drivers/scsi/qedi/qedi_sysfs.c
 create mode 100644 drivers/scsi/qedi/qedi_version.h
 create mode 100644 include/linux/qed/qed_iscsi_if.h

-- 
1.8.3.1

[RFC 4/6] qedi: Add LL2 iSCSI interface for offload iSCSI.

2016-10-18 Thread manish.rangankar

From: Manish Rangankar 

This patch adds support for iscsiuio interface using Light L2 (LL2) qed
interface.

Signed-off-by: Nilesh Javali 
Signed-off-by: Adheer Chandravanshi 
Signed-off-by: Chad Dupuis 
Signed-off-by: Saurav Kashyap 
Signed-off-by: Arun Easi 
Signed-off-by: Manish Rangankar 
---
 drivers/scsi/qedi/qedi.h  |  73 +
 drivers/scsi/qedi/qedi_main.c | 357 ++
 2 files changed, 430 insertions(+)

diff --git a/drivers/scsi/qedi/qedi.h b/drivers/scsi/qedi/qedi.h
index 0a5035e..02fefbd 100644
--- a/drivers/scsi/qedi/qedi.h
+++ b/drivers/scsi/qedi/qedi.h
@@ -21,6 +21,7 @@
 #include 
 #include "qedi_dbg.h"
 #include 
+#include 
 #include "qedi_version.h"
 
 #define QEDI_MODULE_NAME   "qedi"
@@ -54,6 +55,78 @@
 #define QEDI_LOCAL_PORT_MAX 61024
 #define QEDI_LOCAL_PORT_RANGE   (QEDI_LOCAL_PORT_MAX - QEDI_LOCAL_PORT_MIN)
 #define QEDI_LOCAL_PORT_INVALID0x
+#define TX_RX_RING 16
+#define RX_RING(TX_RX_RING - 1)
+#define LL2_SINGLE_BUF_SIZE0x400
+#define QEDI_PAGE_SIZE 4096
+#define QEDI_PAGE_ALIGN(addr)  ALIGN(addr, QEDI_PAGE_SIZE)
+#define QEDI_PAGE_MASK (~((QEDI_PAGE_SIZE) - 1))
+
+#define QEDI_PAGE_SIZE 4096
+#define QEDI_PATH_HANDLE   0xFE000UL
+
+struct qedi_uio_ctrl {
+   /* meta data */
+   u32 uio_hsi_version;
+
+   /* user writes */
+   u32 host_tx_prod;
+   u32 host_rx_cons;
+   u32 host_rx_bd_cons;
+   u32 host_tx_pkt_len;
+   u32 host_rx_cons_cnt;
+
+   /* driver writes */
+   u32 hw_tx_cons;
+   u32 hw_rx_prod;
+   u32 hw_rx_bd_prod;
+   u32 hw_rx_prod_cnt;
+
+   /* other */
+   u8 mac_addr[6];
+   u8 reserve[2];
+};
+
+struct qedi_rx_bd {
+   u32 rx_pkt_index;
+   u32 rx_pkt_len;
+   u16 vlan_id;
+};
+
+#define QEDI_RX_DESC_CNT   (QEDI_PAGE_SIZE / sizeof(struct qedi_rx_bd))
+#define QEDI_MAX_RX_DESC_CNT   (QEDI_RX_DESC_CNT - 1)
+#define QEDI_NUM_RX_BD (QEDI_RX_DESC_CNT * 1)
+#define QEDI_MAX_RX_BD (QEDI_NUM_RX_BD - 1)
+
+#define QEDI_NEXT_RX_IDX(x)x) & (QEDI_MAX_RX_DESC_CNT)) == \
+ (QEDI_MAX_RX_DESC_CNT - 1)) ? \
+(x) + 2 : (x) + 1)
+
+struct qedi_uio_dev {
+   struct uio_info qedi_uinfo;
+   u32 uio_dev;
+   struct list_headlist;
+
+   u32 ll2_ring_size;
+   void*ll2_ring;
+
+   u32 ll2_buf_size;
+   void*ll2_buf;
+
+   void*rx_pkt;
+   void*tx_pkt;
+
+   struct qedi_ctx *qedi;
+   struct pci_dev  *pdev;
+   void*uctrl;
+};
+
+/* List to maintain the skb pointers */
+struct skb_work_list {
+   struct list_head list;
+   struct sk_buff *skb;
+   u16 vlan_id;
+};
 
 /* Queue sizes in number of elements */
 #define QEDI_SQ_SIZE   MAX_OUSTANDING_TASKS_PER_CON
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 35ab2f9..58ac9a2 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -45,9 +45,12 @@
 static struct scsi_transport_template *qedi_scsi_transport;
 static struct pci_driver qedi_pci_driver;
 static DEFINE_PER_CPU(struct qedi_percpu_s, qedi_percpu);
+static LIST_HEAD(qedi_udev_list);
 /* Static function declaration */
 static int qedi_alloc_global_queues(struct qedi_ctx *qedi);
 static void qedi_free_global_queues(struct qedi_ctx *qedi);
+static void qedi_reset_uio_rings(struct qedi_uio_dev *udev);
+static void qedi_ll2_free_skbs(struct qedi_ctx *qedi);
 
 static int qedi_iscsi_event_cb(void *context, u8 fw_event_code, void 
*fw_handle)
 {
@@ -112,6 +115,224 @@ static int qedi_iscsi_event_cb(void *context, u8 
fw_event_code, void *fw_handle)
return rval;
 }
 
+static int qedi_uio_open(struct uio_info *uinfo, struct inode *inode)
+{
+   struct qedi_uio_dev *udev = uinfo->priv;
+   struct qedi_ctx *qedi = udev->qedi;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   if (udev->uio_dev != -1)
+   return -EBUSY;
+
+   rtnl_lock();
+   udev->uio_dev = iminor(inode);
+   qedi_reset_uio_rings(udev);
+   set_bit(UIO_DEV_OPENED, >flags);
+   rtnl_unlock();
+
+   return 0;
+}
+
+static int qedi_uio_close(struct uio_info *uinfo, struct inode *inode)
+{
+   struct qedi_uio_dev *udev = uinfo->priv;
+   struct qedi_ctx *qedi = udev->qedi;
+
+   udev->uio_dev = -1;
+   clear_bit(UIO_DEV_OPENED, >flags);
+   qedi_ll2_free_skbs(qedi);
+   return 0;
+}
+
+static void

Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.

2016-10-18 Thread William Tu

> ...
>> - if (copy_to_user(uvalue, value, value_size) != 0)
>> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>>   goto free_value;
>
> I think such approach won't actually fix anything. User space
> may lose some of the values and won't have any idea what was lost.
> I think we need to fix sample code to avoid using 
> sysconf(_SC_NPROCESSORS_CONF)
> and use /sys/devices/system/cpu/possible instead.
> I would argue that glibc should be fixed as well since relying on
> ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
>

Thanks for the feedback. I think glibc is correct. The
_SC_NPROCESSORS_CONF presents the number of processors
configured/populated and is indeed "ls
/sys/devices/system/cpu/cpu[0-9]*|wc -l". This means the actual number
of CPUs installed on your system. On the other hand, the
num_possible_cpus() includes both the installed CPUs and the empty CPU
socket/slot, in order to support CPU hotplug.

As an example, one of my dual-socket motherboards with 1 CPU installed has
# /sys/devices/system/cpu/possible
0-239
# /sys/devices/system/cpu/cpu[0-9]*|wc -l
12
Note that these 12 cpus could be online/offline by
# echo 1/0 > /sys/devices/system/cpu/cpuX/online
Even if it is offline, the entry is still there.

Thinking about another solution, maybe we should use
"num_present_cpus()" which means the configured/populated CPUs and the
value is the same as sysconf(_SC_NPROCESSORS_CONF). Consider:
1) cpuX is online/offline: the num_present_cpus() remains the same.
2) a new CPU is hotplugged into the empty socket: num_present_cpus()
gets updated, and so does sysconf(_SC_NPROCESSORS_CONF).

+++ b/kernel/bpf/syscall.c
@@ -297,7 +297,7 @@ static int map_lookup_elem(union bpf_attr *attr)

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
-   value_size = round_up(map->value_size, 8) * num_possible_cpus();
+   value_size = round_up(map->value_size, 8) * num_present_cpus();
else
value_size = map->value_size;

Thanks. Regards,
William

Re: [PATCH net-next v12 1/9] openvswitch: use hard_header_len instead of hardcoded ETH_HLEN

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> On tx, use hard_header_len while deciding whether to refragment or drop the
> packet. That way, all combinations are calculated correctly:
>
> * L2 packet going to L2 interface (the L2 header len is subtracted),
> * L2 packet going to L3 interface (the L2 header is included in the packet
>   length),
> * L3 packet going to L3 interface.
>
> Signed-off-by: Jiri Benc 

Acked-by: Pravin B Shelar

Re: [PATCH net-next v12 9/9] openvswitch: use ipgre tunnel rather than gretap tunnel

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> From: Simon Horman 
>
> This allows GRE tunnels to send and receive both
> layer 2 packets (packets with an ethernet header) and
> layer 3 packets (packets without an ethernet header).
>
> Signed-off-by: Simon Horman 
> Signed-off-by: Jiri Benc 
> ---
> v12: removed the non-gre hunks (now part of previous patches in this
>  patchset)
> ---
>  include/net/gre.h   | 4 ++--
>  net/ipv4/ip_gre.c   | 9 +
>  net/openvswitch/vport-gre.c | 2 +-
>  3 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/gre.h b/include/net/gre.h
> index d25d836c129b..1a0bb1cefa60 100644
> --- a/include/net/gre.h
> +++ b/include/net/gre.h
> @@ -31,8 +31,8 @@ struct gre_protocol {
>  int gre_add_protocol(const struct gre_protocol *proto, u8 version);
>  int gre_del_protocol(const struct gre_protocol *proto, u8 version);
>
> -struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
> -  u8 name_assign_type);
> +struct net_device *gre_fb_dev_create(struct net *net, const char *name,
> +u8 name_assign_type);
>  int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
>  bool *csum_err, __be16 proto, int nhs);
>
> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> index 576f705d8180..18caea5c6d09 100644
> --- a/net/ipv4/ip_gre.c
> +++ b/net/ipv4/ip_gre.c
> @@ -1125,8 +1125,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const 
> struct net_device *dev)
> .get_link_net   = ip_tunnel_get_link_net,
>  };
>
> -struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
> -   u8 name_assign_type)
> +struct net_device *gre_fb_dev_create(struct net *net, const char *name,
> +u8 name_assign_type)
>  {
> struct nlattr *tb[IFLA_MAX + 1];
> struct net_device *dev;
> @@ -1137,13 +1137,14 @@ struct net_device *gretap_fb_dev_create(struct net 
> *net, const char *name,
> memset(&tb, 0, sizeof(tb));
>
> dev = rtnl_create_link(net, name, name_assign_type,
> -  &ipgre_tap_ops, tb);
> +  &ipgre_link_ops, tb);
> if (IS_ERR(dev))
> return dev;
>
> /* Configure flow based GRE device. */
> t = netdev_priv(dev);
> t->collect_md = true;
> +   dev->type = ARPHRD_NONE;
>

This is OVS tunnel compatibility code. We are not supposed to add new
features to compat code. Just provide a way to configure such a device
over rtnl.

Re: [PATCH net-next v12 5/9] openvswitch: add processing of L3 packets

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> Support receiving, extracting flow key and sending of L3 packets (packets
> without an Ethernet header).
>
> Note that even after this patch, non-Ethernet interfaces are still not
> allowed to be added to bridges. Similarly, netlink interface for sending and
> receiving L3 packets to/from user space is not in place yet.
>
> Based on previous versions by Lorand Jakab and Simon Horman.
>
> Signed-off-by: Lorand Jakab 
> Signed-off-by: Simon Horman 
> Signed-off-by: Jiri Benc 
> ---
>  net/openvswitch/datapath.c |  17 ++--
>  net/openvswitch/flow.c | 101 
> ++---
>  net/openvswitch/vport.c|  16 +++
>  3 files changed, 96 insertions(+), 38 deletions(-)
>
...

> @@ -505,28 +511,35 @@ static int key_extract(struct sk_buff *skb, struct 
> sw_flow_key *key)
>
> skb_reset_mac_header(skb);
>
> -   /* Link layer.  We are guaranteed to have at least the 14 byte 
> Ethernet
> -* header in the linear data area.
> -*/
> -   eth = eth_hdr(skb);
> -   ether_addr_copy(key->eth.src, eth->h_source);
> -   ether_addr_copy(key->eth.dst, eth->h_dest);
> +   /* Link layer. */
> +   clear_vlan(key);
> +   if (key->mac_proto == MAC_PROTO_NONE) {
> +   if (unlikely(eth_type_vlan(skb->protocol)))
> +   return -EINVAL;
>
> -   __skb_pull(skb, 2 * ETH_ALEN);
> -   /* We are going to push all headers that we pull, so no need to
> -* update skb->csum here.
> -*/
> +   skb_reset_network_header(skb);
> +   } else {
> +   eth = eth_hdr(skb);
> +   ether_addr_copy(key->eth.src, eth->h_source);
> +   ether_addr_copy(key->eth.dst, eth->h_dest);
>
> -   if (unlikely(parse_vlan(skb, key)))
> -   return -ENOMEM;
> +   __skb_pull(skb, 2 * ETH_ALEN);
> +   /* We are going to push all headers that we pull, so no need 
> to
> +   * update skb->csum here.
> +   */
>
> -   key->eth.type = parse_ethertype(skb);
> -   if (unlikely(key->eth.type == htons(0)))
> -   return -ENOMEM;
> +   if (unlikely(parse_vlan(skb, key)))
> +   return -ENOMEM;
>
> -   skb_reset_network_header(skb);
> +   skb->protocol = parse_ethertype(skb);

I am not sure about changing skb->protocol here.
By changing this, the skb loses information about the packet type. Therefore,
if the packet re-enters OVS (through a different bridge), it would
look like an L3 packet: the function key_extract_mac_proto() would not see
a TEB type packet.

> +   if (unlikely(skb->protocol == htons(0)))
> +   return -ENOMEM;
> +
> +   skb_reset_network_header(skb);
> +   __skb_push(skb, skb->data - skb_mac_header(skb));
> +   }
> skb_reset_mac_len(skb);
> -   __skb_push(skb, skb->data - skb_mac_header(skb));
> +   key->eth.type = skb->protocol;
>
> /* Network layer. */
> if (key->eth.type == htons(ETH_P_IP)) {
> @@ -721,6 +734,20 @@ int ovs_flow_key_update(struct sk_buff *skb, struct 
> sw_flow_key *key)
> return key_extract(skb, key);
>  }
>
> +static u8 key_extract_mac_proto(struct sk_buff *skb)
> +{
> +   switch (skb->dev->type) {
> +   case ARPHRD_ETHER:
> +   return MAC_PROTO_ETHERNET;
> +   case ARPHRD_NONE:
> +   if (skb->protocol == htons(ETH_P_TEB))
> +   return MAC_PROTO_ETHERNET;
> +   return MAC_PROTO_NONE;
> +   }
> +   WARN_ON_ONCE(1);
> +   return MAC_PROTO_ETHERNET;
> +}
> +

Re: [PATCH net-next v12 4/9] openvswitch: support MPLS push and pop for L3 packets

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> Update Ethernet header only if there is one.
>
> Signed-off-by: Jiri Benc 

Acked-by: Pravin B Shelar

Re: [PATCH net-next v12 3/9] openvswitch: pass mac_proto to ovs_vport_send

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> We'll need it to alter packets sent to ARPHRD_NONE interfaces.
>
> Change do_output() to use the actual L2 header size of the packet when
> deciding on the minimum cutlen. The assumption here is that what matters is
> not the output interface hard_header_len but rather the L2 header of the
> particular packet. For example, ARPHRD_NONE tunnels that encapsulate
> Ethernet should get at least the Ethernet header.
>
> Signed-off-by: Jiri Benc 

Acked-by: Pravin B Shelar

Re: [PATCH net-next v12 2/9] openvswitch: add mac_proto field to the flow key

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> Use a hole in the structure. We support only Ethernet so far and will add
> a support for L2-less packets shortly. We could use a bool to indicate
> whether the Ethernet header is present or not but the approach with the
> mac_proto field is more generic and occupies the same number of bytes in the
> struct, while allowing later extensibility. It also makes the code in the
> next patches more self explaining.
>
> It would be nice to use ARPHRD_ constants but those are u16 which would be
> waste. Thus define our own constants.
>
> Another upside of this is that we can overload this new field to also denote
> whether the flow key is valid. This has the advantage that on
> refragmentation, we don't have to reparse the packet but can rely on the
> stored eth.type. This is especially important for the next patches in this
> series - instead of adding another branch for L2-less packets before calling
> ovs_fragment, we can just remove all those branches completely.
>
> Signed-off-by: Jiri Benc 
> ---
> There are three possible approaches:
>
> (1) The one in this patch.
>
> (2) Use just a one bit flag indicating whether the packet is L3 or Ethernet
> (similar to the "is_layer3" bool in v11). The code would stay very
> similar to this patchset, the memory consumption would be the same.
>
> (3) Use value of 14 for MAC_PROTO_ETHERNET. It would simplify things nicely,
> as ovs_mac_header_len would be identical to ovs_key_mac_proto, saving
> one comparison. Of course, this would mean that if other L2 protocols
> are added in the future, they can only have L2 header length different
> than 14. Sounds hacky, although I kind of like this.
>
> After thinking about pros and cons, I implemented (1). Seems to be most
> clear of the three options. But I'm happy to implement (2) or (3) if it's
> deemed better.

I like approach taken by this patch.

Acked-by: Pravin B Shelar

Re: [PATCH net-next v12 0/9] openvswitch: support for layer 3 encapsulated packets

2016-10-18 Thread Pravin Shelar

On Mon, Oct 17, 2016 at 6:02 AM, Jiri Benc  wrote:
> At the core of this patch set is removing the assumption in Open vSwitch
> datapath that all packets have Ethernet header. Support for layer 3 GRE
> tunnels is also added by this patchset.
>
> The implementation relies on the presence of pop_eth and push_eth actions
> in datapath flows to facilitate adding and removing Ethernet headers as
> appropriate. The construction of such flows is left up to user-space.
>
> This series is based on work by Simon Horman, Lorand Jakab, Thomas Morin and
> others. I kept Lorand's and Simon's s-o-b in the patches that are derived
> from v11 to record their authorship of parts of the code. Please let me know
> if you disagree with this.
>
> v12 differs from v11 a lot. The main changes are:
>
> * The patches were restructured and split differently for easier review.
> * They were rebased and adjusted to the current net-next. Especially MPLS
>   handling is different (and easier) thanks to the recent MPLS GSO rework.
> * Several bugs were discovered and fixed. The most notable is fragment
>   handling: header adjustment for ARPHRD_NONE devices on tx needs to be done
>   after refragmentation, not before it. This required significant changes in
>   the patchset. Another one is stricter checking of attributes (match on L2
>   vs. L3 packet) at the kernel level.
> * Instead of is_layer3 bool, a mac_proto field is used. See patch 2. This is
>   a matter of taste and alternate approaches are offered in patch 2
>   description.
>
> There is no change to uAPI since v11. The previously posted patchset for
> Open vSwitch user space works with this submission unmodified.
>

I have not finished the review yet, but most of the patches look good to
me. Can you send userspace patches against the latest master so that I can
try the patches with a tunnel setup?

Re: tg3 BUG: spinlock lockup suspected

2016-10-18 Thread Siva Reddy Kallam

On Mon, Oct 17, 2016 at 6:35 PM, Meelis Roos  wrote:
>> > Now I reproduced the bug even with 4.7-rc1 so it is older than 4.7. Will
>> > test further.
>>
>> It gets stranger and stranger - my old 4.7 image worked fine, freshly
>> compiled 4.7 exhibits the same problem.
>>
>> Toolchain has not changed, that I know for sure.
>>
>> What may have changed is kernel .config. My old conf was with whatever I
>> had during 4.7. Then I upgraded to 4.8-rc3 and then 4.8 and selected
>> values for "make oldconfig" new entries. Then went back to 4.7-rc1 and
>> then to 4.7 with this config, answering questions about new options when
>> any appeared. Diff is not available since I do not have the old configs
>> archived.
>
> I did some more digging. Found an older configuration that is working
> and recreated a newer one that is bad, for the same 4.7 kernel. This is
> reproducible now, from "make clean" state.
>
> Working config from 4.7-rc4 attached as config-4.7, broken config from
> 4.7 attached as config-4.7-bad.
>
> Will try to bisect the configs as time permits. But looking at the
> stack traces, the issue is probably timing related, when ip and dhclient
> do something with the same lock. seq_read that outputs stats could be
> reading /proc/net/dev that reads counters from each interface.
>
> ifupdown seems to use the following for dhcp interfaces:
>   up
> [[/bin/ip link set dev %iface% address %hwaddress%]]
> /sbin/dhclient -v -pf /run/dhclient.%iface%.pid -lf 
> /var/lib/dhcp/dhclient.%iface%.leases -I -df 
> /var/lib/dhcp/dhclient6.%iface%.leases %iface% \
> ...
>
> so ip link is setting link up, this creates some work for the
> background, and the dhclient goes and reads /proc/net/dev, and lockup is
> suspected but not proven?
>
> I started a loop for test, doing cat /proc/net/dev in a loop and at the
> same link link up and down from console, but up and down is slow process
> and the loop did not seem to trigger the warning over night, so it was
> not so simple.
>
I am busy with other priority tasks. One of my colleagues, Deepak, will
work on this with you.
I have added him to the CC list.
Thanks.
>
>> > > [   83.716570] BUG: spinlock lockup suspected on CPU#0, dhclient/1014
>> > > [   83.797819]  lock: 0xfff000123c8e4a08, .magic: dead4ead, .owner: 
>> > > ip/1001, .owner_cpu: 1
>> > > [   83.903130] CPU: 0 PID: 1014 Comm: dhclient Not tainted 4.8.0 #4
>> > > [   83.982129] Call Trace:
>> > > [   84.014160]  [004b7220] spin_dump+0x60/0xa0
>> > > [   84.078203]  [004b73a0] do_raw_spin_lock+0xa0/0x120
>> > > [   84.106344] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready
>> > > [   84.107193] ip (1001) used greatest stack depth: 2168 bytes left
>> > > [   84.306955]  [0092c0d0] _raw_spin_lock_bh+0x30/0x40
>> > > [   84.380188]  [100822cc] tg3_get_stats64+0xc/0x80 [tg3]
>> > > [   84.456885]  [007fac8c] dev_get_stats+0x2c/0xc0
>> > > [   84.525506]  [0081a4e8] dev_seq_printf_stats+0x8/0xe0
>> > > [   84.600986]  [0081a5e4] dev_seq_show+0x24/0x40
>> > > [   84.668467]  [005cb6c4] seq_read+0x2c4/0x440
>> > > [   84.733656]  [0060b97c] proc_reg_read+0x3c/0x80
>> > > [   84.802282]  [005a219c] __vfs_read+0x1c/0x140
>> > > [   84.868613]  [005a2310] vfs_read+0x50/0x100
>> > > [   84.932662]  [005a265c] SyS_read+0x3c/0xa0
>> > > [   84.995573]  [004061d4] linux_sparc_syscall32+0x34/0x60
>> > > [   85.073748] * CPU[  0]: TSTATE[0044f0001a22] 
>> > > TPC[f79a16b0] TNPC[f79a16b4] TASK[dhclient:1014]
>> > > [   85.208732]  TPC[f79a16b0] O7[f79405c8] I7[0] RPC[0]
>> > > [   85.287633]   CPU[  1]: TSTATE[004480001605] 
>> > > TPC[004b26f0] TNPC[004d0b0c] TASK[swapper/1:0]
>> > > [   85.420338]  TPC[trace_hardirqs_off+0x10/0x20] 
>> > > O7[rcu_idle_enter+0x64/0xa0] I7[cpu_startup_entry+0x1b0/0x240] 
>> > > RPC[rest_init+0x178/0x1a0]
>> > > [   85.664600] tg3 :00:02.0 eth0: Link is up at 100 Mbps, full duplex
>> > > [   85.750515] tg3 :00:02.0 eth0: Flow control is off for TX and off 
>> > > for RX
>> > > [   85.843994] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
>
> --
> Meelis Roos (mr...@linux.ee)

bad commit touching stmmac_ptp.c

2016-10-18 Thread Nicolas Pitre

Hello,

I noticed a recently added commit 7086605a6a ("stmmac: fix error check 
when init ptp") to the mainline linux tree from you. This commit is 
wrong.  The affected code now reads as:

int stmmac_ptp_register(struct stmmac_priv *priv)
{
spin_lock_init(&priv->ptp_lock);
priv->ptp_clock_ops = stmmac_ptp_clock_ops;

priv->ptp_clock = ptp_clock_register(&priv->ptp_clock_ops,
 priv->device);
if (IS_ERR(priv->ptp_clock)) {
priv->ptp_clock = NULL;
return PTR_ERR(priv->ptp_clock);
}

spin_lock_init(&priv->ptp_lock);

netdev_dbg(priv->dev, "Added PTP HW clock successfully\n");

return 0;
}

Firstly, you basically reverted the change I did with commit 
efee95f42b ("ptp_clock: future-proofing drivers against PTP subsystem 
becoming optional").  Please have a look at that commit and ponder its 
implications.

Secondly, the error you're actually returning to the caller with your 
patch is actually PTR_ERR(NULL) which is basically a more convoluted way 
to return the same value as what was returned before your patch, which 
is probably not what you intended.

And finally you added a needless initialization of priv->ptp_lock given 
that this was already done a few lines before that addition.

Was this patch actually reviewed?


Nicolas

Re: [PATCH] crypto: ccm - avoid scatterlist for MAC encryption

2016-10-18 Thread Herbert Xu

On Mon, Oct 17, 2016 at 06:21:14PM +0100, Ard Biesheuvel wrote:
>
> Annoyingly, all this complication with scatterlists etc is for doing
> asynchronous crypto via DMA capable crypto accelerators, and the
> networking code (ipsec as well as mac80211, afaik) only allow
> synchronous in the first place, given that they execute in softirq
> context.

I'm still thinking about the issue (in particular, whether we
should continue to rely on the request context being SG-capable
or allow it to be on the stack for AEAD).

But IPsec definitely supports async crypto.  In fact it was the
very first user of async crypto.

mac80211 on the other hand is currently sync-only.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[PATCH net-next 5/6] net: use core MTU range checking in virt drivers

2016-10-18 Thread Jarod Wilson

hyperv_net:
- set min/max_mtu

virtio_net:
- set min/max_mtu
- remove virtnet_change_mtu

vmxnet3:
- set min/max_mtu

CC: netdev@vger.kernel.org
CC: virtualizat...@lists.linux-foundation.org
CC: "K. Y. Srinivasan" 
CC: Haiyang Zhang 
CC: "Michael S. Tsirkin" 
CC: Shrikrishna Khare 
CC: "VMware, Inc." 
Signed-off-by: Jarod Wilson 
---
 drivers/net/hyperv/hyperv_net.h   |  4 ++--
 drivers/net/hyperv/netvsc_drv.c   | 14 +++---
 drivers/net/virtio_net.c  | 23 ++-
 drivers/net/vmxnet3/vmxnet3_drv.c |  7 ---
 4 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index f4fbcb5..3958ada 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -606,8 +606,8 @@ struct nvsp_message {
 } __packed;
 
 
-#define NETVSC_MTU 65536
-#define NETVSC_MTU_MIN 68
+#define NETVSC_MTU 65535
+#define NETVSC_MTU_MIN ETH_MIN_MTU
 
 #define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024*16)  /* 16MB */
 #define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY  (1024*1024*15)  /* 15MB */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f0919bd..3dc9679 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -872,19 +872,12 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
struct netvsc_device *nvdev = ndevctx->nvdev;
struct hv_device *hdev = ndevctx->device_ctx;
struct netvsc_device_info device_info;
-   int limit = ETH_DATA_LEN;
u32 num_chn;
int ret = 0;
 
if (ndevctx->start_remove || !nvdev || nvdev->destroy)
return -ENODEV;
 
-   if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
-   limit = NETVSC_MTU - ETH_HLEN;
-
-   if (mtu < NETVSC_MTU_MIN || mtu > limit)
-   return -EINVAL;
-
ret = netvsc_close(ndev);
if (ret)
goto out;
@@ -1343,6 +1336,13 @@ static int netvsc_probe(struct hv_device *dev,
 
netif_carrier_off(net);
 
+   /* MTU range: 68 - 1500 or 65521 */
+   net->min_mtu = NETVSC_MTU_MIN;
+   if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
+   net->max_mtu = NETVSC_MTU - ETH_HLEN;
+   else
+   net->max_mtu = ETH_DATA_LEN;
+
netvsc_init_settings(net);
 
net_device_ctx = netdev_priv(net);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fad84f3..4885a42 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1419,17 +1419,6 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
.set_settings = virtnet_set_settings,
 };
 
-#define MIN_MTU 68
-#define MAX_MTU 65535
-
-static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
-{
-   if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
-   return -EINVAL;
-   dev->mtu = new_mtu;
-   return 0;
-}
-
 static const struct net_device_ops virtnet_netdev = {
.ndo_open= virtnet_open,
.ndo_stop= virtnet_close,
@@ -1437,7 +1426,6 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_validate_addr   = eth_validate_addr,
.ndo_set_mac_address = virtnet_set_mac_address,
.ndo_set_rx_mode = virtnet_set_rx_mode,
-   .ndo_change_mtu  = virtnet_change_mtu,
.ndo_get_stats64 = virtnet_stats,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
@@ -1748,6 +1736,9 @@ static bool virtnet_validate_features(struct 
virtio_device *vdev)
return true;
 }
 
+#define MIN_MTU ETH_MIN_MTU
+#define MAX_MTU 65535
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
int i, err;
@@ -1821,6 +1812,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 
dev->vlan_features = dev->features;
 
+   /* MTU range: 68 - 65535 */
+   dev->min_mtu = MIN_MTU;
+   dev->max_mtu = MAX_MTU;
+
/* Configuration may specify what MAC to use.  Otherwise random. */
if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
virtio_cread_bytes(vdev,
@@ -1875,8 +1870,10 @@ static int virtnet_probe(struct virtio_device *vdev)
mtu = virtio_cread16(vdev,
 offsetof(struct virtio_net_config,
  mtu));
-   if (virtnet_change_mtu(dev, mtu))
+   if (mtu >= dev->min_mtu && mtu <= dev->max_mtu) {
+   dev->mtu = mtu;
__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
+   }
}
 
if (vi->any_header_sg)
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c 
b/drivers/net/vmxnet3/vmxnet3_drv.c
index b5554f2..0c36de1 100644
---

[PATCH net-next 6/6] net: use core MTU range checking in misc drivers

2016-10-18 Thread Jarod Wilson

firewire-net:
- set min/max_mtu
- remove fwnet_change_mtu

nes:
- set max_mtu
- clean up nes_netdev_change_mtu

xpnet:
- set min/max_mtu
- remove xpnet_dev_change_mtu

hippi:
- set min/max_mtu
- remove hippi_change_mtu

batman-adv:
- set max_mtu
- remove batadv_interface_change_mtu
- initialization is a little async, not 100% certain that max_mtu is set
  in the optimal place, don't have hardware to test with

rionet:
- set min/max_mtu
- remove rionet_change_mtu

slip:
- set min/max_mtu
- streamline sl_change_mtu

CC: netdev@vger.kernel.org
CC: Stefan Richter 
CC: Faisal Latif 
CC: linux-r...@vger.kernel.org
CC: Cliff Whickman 
CC: Robin Holt 
CC: Jes Sorensen 
CC: Marek Lindner 
CC: Simon Wunderlich 
CC: Antonio Quartulli 
Signed-off-by: Jarod Wilson 
---
 drivers/firewire/net.c  | 12 ++--
 drivers/infiniband/hw/nes/nes.c |  1 -
 drivers/infiniband/hw/nes/nes.h |  4 ++--
 drivers/infiniband/hw/nes/nes_nic.c | 10 +++---
 drivers/misc/sgi-xp/xpnet.c | 21 -
 drivers/net/hippi/rrunner.c |  1 -
 drivers/net/rionet.c| 15 +++
 drivers/net/slip/slip.c | 11 +--
 include/linux/hippidevice.h |  1 -
 net/802/hippi.c | 14 ++
 net/batman-adv/soft-interface.c | 13 +
 11 files changed, 22 insertions(+), 81 deletions(-)

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 309311b..b5f125c 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1349,15 +1349,6 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct 
net_device *net)
return NETDEV_TX_OK;
 }
 
-static int fwnet_change_mtu(struct net_device *net, int new_mtu)
-{
-   if (new_mtu < 68)
-   return -EINVAL;
-
-   net->mtu = new_mtu;
-   return 0;
-}
-
 static const struct ethtool_ops fwnet_ethtool_ops = {
.get_link   = ethtool_op_get_link,
 };
@@ -1366,7 +1357,6 @@ static const struct net_device_ops fwnet_netdev_ops = {
.ndo_open   = fwnet_open,
.ndo_stop   = fwnet_stop,
.ndo_start_xmit = fwnet_tx,
-   .ndo_change_mtu = fwnet_change_mtu,
 };
 
 static void fwnet_init_dev(struct net_device *net)
@@ -1481,6 +1471,8 @@ static int fwnet_probe(struct fw_unit *unit,
max_mtu = (1 << (card->max_receive + 1))
  - sizeof(struct rfc2734_header) - IEEE1394_GASP_HDR_SIZE;
net->mtu = min(1500U, max_mtu);
+   net->min_mtu = ETH_MIN_MTU;
+   net->max_mtu = net->mtu;
 
/* Set our hardware address while we're at it */
ha = (union fwnet_hwaddr *)net->dev_addr;
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index 35cbb17..2baa45a 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -65,7 +65,6 @@ MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
-int max_mtu = 9000;
 int interrupt_mod_interval = 0;
 
 /* Interoperability */
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index e7430c9..85acd08 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -83,6 +83,8 @@
 #define NES_FIRST_QPN   64
 #define NES_SW_CONTEXT_ALIGN1024
 
+#define NES_MAX_MTU9000
+
 #define NES_NIC_MAX_NICS16
 #define NES_MAX_ARP_TABLE_SIZE  4096
 
@@ -169,8 +171,6 @@ do { \
 #include "nes_cm.h"
 #include "nes_mgt.h"
 
-extern int max_mtu;
-#define max_frame_len (max_mtu+ETH_HLEN)
 extern int interrupt_mod_interval;
 extern int nes_if_count;
 extern int mpa_version;
diff --git a/drivers/infiniband/hw/nes/nes_nic.c 
b/drivers/infiniband/hw/nes/nes_nic.c
index 2b27d13..7f8597d 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -981,20 +981,16 @@ static int nes_netdev_change_mtu(struct net_device 
*netdev, int new_mtu)
 {
struct nes_vnic *nesvnic = netdev_priv(netdev);
struct nes_device *nesdev = nesvnic->nesdev;
-   int ret = 0;
u8 jumbomode = 0;
u32 nic_active;
u32 nic_active_bit;
u32 uc_all_active;
u32 mc_all_active;
 
-   if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu))
-   return -EINVAL;
-
netdev->mtu = new_mtu;
nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN;
 
-   if (netdev->mtu > 1500) {
+   if (netdev->mtu > ETH_DATA_LEN) {
jumbomode=1;
}
nes_nic_init_timer_defaults(nesdev, jumbomode);
@@ -1020,7 +1016,7 @@ static int nes_netdev_change_mtu(struct net_device 
*netdev, int new_mtu)
nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
}

Re: [PATCH net-next 00/15] ethernet: use core min/max MTU checking

2016-10-18 Thread Jarod Wilson

On Tue, Oct 18, 2016 at 06:28:20PM -0400, Jarod Wilson wrote:
> On Tue, Oct 18, 2016 at 11:33:27AM -0400, David Miller wrote:
> > From: Jarod Wilson 
> > Date: Mon, 17 Oct 2016 16:29:43 -0400
> > 
> > > On Mon, Oct 17, 2016 at 04:03:41PM -0400, David Miller wrote:
> > >> From: Jarod Wilson 
> > >> Date: Mon, 17 Oct 2016 15:54:02 -0400
> > >> 
> > >> > For the most part, every patch does the same essential thing: removes 
> > >> > the
> > >> > MTU range checking from the drivers' ndo_change_mtu function, puts 
> > >> > those
> > >> > ranges into the core net_device min_mtu and max_mtu fields, and where
> > >> > possible, removes ndo_change_mtu functions entirely.
> > >> 
> > >> Jarod, please read my other posting.
> > > 
> > > Done, didn't see it until just after I'd hit send, have replied there as
> > > well.
> > > 
> > >> You've positively broken the maximum MTU for all of these drivers.
> > >> 
> > >> That's not cool.
> > >>
> > >> And this series fixing things doesn't make things better, because now
> > >> we've significantly broken bisection for anyone running into this
> > >> regression.
> > > 
> > > Agreed, and my suggestion right now is to revert the 2nd patch from the
> > > prior series. I believe it can be resubmitted after all other callers of
> > > ether_setup() have been converted to have their own min/max_mtu.
> > > 
> > >> You should have arranged this in such a way that the drivers needing
> > >> > 1500 byte MTU were not impacted at all by your changes, but that
> > >> isn't what happened.
> > > 
> > > Yeah, I must admit to not looking closely enough at the state the first
> > > two patches left things in. It was absolutely my intention to not alter
> > > behaviour in any way, but I neglected to test sufficiently without this
> > > additional set applied.
> > 
> > So what I'm going to do now is simply just apply your current patch series
> > to net-next and hope this gets everything working again.
> 
> Unfortunately, no, it doesn't get *everything* working again, because...
> 
> direct ether_setup() callers:
> 
> drivers/misc/sgi-xp/xpnet.c
> drivers/net/geneve.c
> drivers/net/macvlan.c
> drivers/net/tun.c
> drivers/net/vxlan.c
> drivers/net/wan/hdlc.c
> drivers/net/wan/hdlc_fr.c
> drivers/net/wireless/ath/wil6210/netdev.c
> drivers/net/wireless/cisco/airo.c
> drivers/staging/wlan-ng/p80211netdev.c
> net/batman-adv/soft-interface.c
> net/bridge/br_device.c
> net/openvswitch/vport-internal_dev.c
> 
> alloc_etherdev*() callers:
> drivers/infiniband/hw/nes/nes_nic.c
> drivers/net/hyperv/netvsc_drv.c
> drivers/net/rionet.c
> drivers/net/usb/lan78xx.c
> drivers/net/usb/r8152.c
> drivers/net/usb/usbnet.c
> drivers/net/virtio_net.c
> drivers/net/vmxnet3/vmxnet3_drv.c
> drivers/net/wireless/atmel/atmel.c
> drivers/net/wireless/cisco/airo.c
> drivers/net/wireless/intel/ipw2x00/libipw_module.c
> net/atm/lec.c
> 
> I have additional patches for all of these that I haven't yet posted, so
> I'd still suggest backing out the one patch to keep the above working too
> until the subsequent patches are posted.

They're all posted now, just 6 more relatively small patches, though the
ones touching geneve and vxlan are a bit more involved than any others,
and could use a very close look (relevant people should all be cc'd).
Still wouldn't have any objection at all to backing out the patch that
touches min/max_mtu in ether_setup() though.

-- 
Jarod Wilson
ja...@redhat.com

[PATCH net-next 4/6] net: use core MTU range checking in core net infra

2016-10-18 Thread Jarod Wilson

geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
  closer inspection and testing

macvlan:
- set min/max_mtu

tun:
- set min/max_mtu, remove tun_net_change_mtu

vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu, set min/max_mtu
- This one is also not as straight-forward and could use closer inspection
  and testing from vxlan folks

bridge:
- set max_mtu via br_min_mtu()

openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
  is the largest possible size supported

sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)

CC: netdev@vger.kernel.org
CC: Nicolas Dichtel 
CC: Hannes Frederic Sowa 
CC: Tom Herbert 
CC: Daniel Borkmann 
CC: Alexander Duyck 
CC: Paolo Abeni 
CC: Jiri Benc 
CC: WANG Cong 
CC: Roopa Prabhu 
CC: Pravin B Shelar 
CC: Sabrina Dubroca 
CC: Patrick McHardy 
CC: Stephen Hemminger 
CC: Pravin Shelar 
Signed-off-by: Jarod Wilson 
---
 drivers/net/geneve.c | 48 +++-
 drivers/net/macvlan.c|  6 +++-
 drivers/net/tun.c| 20 
 drivers/net/vxlan.c  | 62 ++--
 net/bridge/br_device.c   |  9 +++---
 net/openvswitch/vport-internal_dev.c | 10 --
 net/sched/sch_teql.c |  5 ++-
 7 files changed, 67 insertions(+), 93 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 3c20e87..752bcaa 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1034,39 +1034,18 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, 
struct net_device *dev)
return geneve_xmit_skb(skb, dev, info);
 }
 
-static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool 
strict)
+static int geneve_change_mtu(struct net_device *dev, int new_mtu)
 {
-   struct geneve_dev *geneve = netdev_priv(dev);
-   /* The max_mtu calculation does not take account of GENEVE
-* options, to avoid excluding potentially valid
-* configurations.
+   /* Only possible if called internally, ndo_change_mtu path's new_mtu
+* is guaranteed to be between dev->min_mtu and dev->max_mtu.
 */
-   int max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;
-
-   if (geneve->remote.sa.sa_family == AF_INET6)
-   max_mtu -= sizeof(struct ipv6hdr);
-   else
-   max_mtu -= sizeof(struct iphdr);
-
-   if (new_mtu < 68)
-   return -EINVAL;
-
-   if (new_mtu > max_mtu) {
-   if (strict)
-   return -EINVAL;
-
-   new_mtu = max_mtu;
-   }
+   if (new_mtu > dev->max_mtu)
+   new_mtu = dev->max_mtu;
 
dev->mtu = new_mtu;
return 0;
 }
 
-static int geneve_change_mtu(struct net_device *dev, int new_mtu)
-{
-   return __geneve_change_mtu(dev, new_mtu, true);
-}
-
 static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff 
*skb)
 {
struct ip_tunnel_info *info = skb_tunnel_info(skb);
@@ -1170,6 +1149,14 @@ static void geneve_setup(struct net_device *dev)
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 
+   /* MTU range: 68 - (something less than 65535) */
+   dev->min_mtu = ETH_MIN_MTU;
+   /* The max_mtu calculation does not take account of GENEVE
+* options, to avoid excluding potentially valid
+* configurations. This will be further reduced by IPvX hdr size.
+*/
+   dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;
+
netif_keep_dst(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
@@ -1285,10 +1272,13 @@ static int geneve_configure(struct net *net, struct 
net_device *dev,
 
/* make enough headroom for basic scenario */
encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
-   if (remote->sa.sa_family == AF_INET)
+   if (remote->sa.sa_family == AF_INET) {
encap_len += sizeof(struct iphdr);
-   else
+   dev->max_mtu -= sizeof(struct iphdr);
+   } else {
encap_len += sizeof(struct ipv6hdr);
+   dev->max_mtu -= sizeof(struct ipv6hdr);
+   }
dev->needed_headroom = encap_len + ETH_HLEN;
 
if (metadata) {
@@ -1488,7 +1478,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, 
const char *name,

[PATCH net-next 2/6] net: use core MTU range checking in wireless drivers

2016-10-18 Thread Jarod Wilson

- set max_mtu in wil6210 driver
- set max_mtu in atmel driver
- set min/max_mtu in cisco airo driver, remove airo_change_mtu
- set min/max_mtu in ipw2100/ipw2200 drivers, remove libipw_change_mtu
- set min/max_mtu in p80211netdev, remove wlan_change_mtu

CC: netdev@vger.kernel.org
CC: linux-wirel...@vger.kernel.org
CC: Maya Erez 
CC: Simon Kelley 
CC: Stanislav Yakovlev 
Signed-off-by: Jarod Wilson 
---
 drivers/net/wireless/ath/wil6210/netdev.c  | 17 +
 drivers/net/wireless/atmel/atmel.c | 13 -
 drivers/net/wireless/cisco/airo.c  | 14 +++---
 drivers/net/wireless/intel/ipw2x00/ipw2100.c   |  3 ++-
 drivers/net/wireless/intel/ipw2x00/ipw2200.c   |  8 ++--
 drivers/net/wireless/intel/ipw2x00/libipw.h|  1 -
 drivers/net/wireless/intel/ipw2x00/libipw_module.c |  9 -
 drivers/staging/wlan-ng/p80211netdev.c | 18 +-
 8 files changed, 21 insertions(+), 62 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/netdev.c 
b/drivers/net/wireless/ath/wil6210/netdev.c
index 61de5e9..d18372c 100644
--- a/drivers/net/wireless/ath/wil6210/netdev.c
+++ b/drivers/net/wireless/ath/wil6210/netdev.c
@@ -41,21 +41,6 @@ static int wil_stop(struct net_device *ndev)
return wil_down(wil);
 }
 
-static int wil_change_mtu(struct net_device *ndev, int new_mtu)
-{
-   struct wil6210_priv *wil = ndev_to_wil(ndev);
-
-   if (new_mtu < 68 || new_mtu > mtu_max) {
-   wil_err(wil, "invalid MTU %d\n", new_mtu);
-   return -EINVAL;
-   }
-
-   wil_dbg_misc(wil, "change MTU %d -> %d\n", ndev->mtu, new_mtu);
-   ndev->mtu = new_mtu;
-
-   return 0;
-}
-
 static int wil_do_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd)
 {
struct wil6210_priv *wil = ndev_to_wil(ndev);
@@ -69,7 +54,6 @@ static const struct net_device_ops wil_netdev_ops = {
.ndo_start_xmit = wil_start_xmit,
.ndo_set_mac_address= eth_mac_addr,
.ndo_validate_addr  = eth_validate_addr,
-   .ndo_change_mtu = wil_change_mtu,
.ndo_do_ioctl   = wil_do_ioctl,
 };
 
@@ -126,6 +110,7 @@ static int wil6210_netdev_poll_tx(struct napi_struct *napi, 
int budget)
 static void wil_dev_setup(struct net_device *dev)
 {
ether_setup(dev);
+   dev->max_mtu = mtu_max;
dev->tx_queue_len = WIL_TX_Q_LEN_DEFAULT;
 }
 
diff --git a/drivers/net/wireless/atmel/atmel.c 
b/drivers/net/wireless/atmel/atmel.c
index bf2e9a0..eb92d5a 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -1295,14 +1295,6 @@ static struct iw_statistics 
*atmel_get_wireless_stats(struct net_device *dev)
return >wstats;
 }
 
-static int atmel_change_mtu(struct net_device *dev, int new_mtu)
-{
-   if ((new_mtu < 68) || (new_mtu > 2312))
-   return -EINVAL;
-   dev->mtu = new_mtu;
-   return 0;
-}
-
 static int atmel_set_mac_address(struct net_device *dev, void *p)
 {
struct sockaddr *addr = p;
@@ -1506,7 +1498,6 @@ static const struct file_operations atmel_proc_fops = {
 static const struct net_device_ops atmel_netdev_ops = {
.ndo_open   = atmel_open,
.ndo_stop   = atmel_close,
-   .ndo_change_mtu = atmel_change_mtu,
.ndo_set_mac_address= atmel_set_mac_address,
.ndo_start_xmit = start_tx,
.ndo_do_ioctl   = atmel_ioctl,
@@ -1600,6 +1591,10 @@ struct net_device *init_atmel_card(unsigned short irq, 
unsigned long port,
dev->irq = irq;
dev->base_addr = port;
 
+   /* MTU range: 68 - 2312 */
+   dev->min_mtu = 68;
+   dev->max_mtu = MAX_WIRELESS_BODY - ETH_FCS_LEN;
+
SET_NETDEV_DEV(dev, sys_dev);
 
if ((rc = request_irq(dev->irq, service_interrupt, IRQF_SHARED, 
dev->name, dev))) {
diff --git a/drivers/net/wireless/cisco/airo.c 
b/drivers/net/wireless/cisco/airo.c
index 69b826d..4b04045 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -2329,14 +2329,6 @@ static int airo_set_mac_address(struct net_device *dev, 
void *p)
return 0;
 }
 
-static int airo_change_mtu(struct net_device *dev, int new_mtu)
-{
-   if ((new_mtu < 68) || (new_mtu > 2400))
-   return -EINVAL;
-   dev->mtu = new_mtu;
-   return 0;
-}
-
 static LIST_HEAD(airo_devices);
 
 static void add_airo_dev(struct airo_info *ai)
@@ -2656,7 +2648,6 @@ static const struct net_device_ops airo11_netdev_ops = {
.ndo_get_stats  = airo_get_stats,
.ndo_set_mac_address= airo_set_mac_address,
.ndo_do_ioctl   = airo_ioctl,
-   .ndo_change_mtu = airo_change_mtu,
 };
 
 static void wifi_setup(struct net_device *dev)
@@ -2668,6 +2659,8 @@ static void wifi_setup(struct

[PATCH net-next 3/6] net: use core MTU range checking in WAN drivers

2016-10-18 Thread Jarod Wilson

- set min/max_mtu in all hdlc drivers, remove hdlc_change_mtu
- set max_mtu in lec driver, remove lec_change_mtu

CC: netdev@vger.kernel.org
CC: Krzysztof Halasa 
CC: Krzysztof Halasa 
CC: Jan "Yenya" Kasprzak 
CC: Francois Romieu 
CC: Kevin Curtis 
CC: Zhao Qiang 
Signed-off-by: Jarod Wilson 
---
 drivers/char/pcmcia/synclink_cs.c |  1 -
 drivers/net/wan/c101.c|  1 -
 drivers/net/wan/cosa.c|  1 -
 drivers/net/wan/dscc4.c   |  1 -
 drivers/net/wan/farsync.c |  1 -
 drivers/net/wan/fsl_ucc_hdlc.c|  1 -
 drivers/net/wan/hdlc.c| 11 ++-
 drivers/net/wan/hdlc_fr.c |  3 ++-
 drivers/net/wan/hostess_sv11.c|  1 -
 drivers/net/wan/ixp4xx_hss.c  |  1 -
 drivers/net/wan/lmc/lmc_main.c|  1 -
 drivers/net/wan/n2.c  |  1 -
 drivers/net/wan/pc300too.c|  1 -
 drivers/net/wan/pci200syn.c   |  1 -
 drivers/net/wan/sealevel.c|  1 -
 drivers/net/wan/wanxl.c   |  1 -
 drivers/tty/synclink.c|  1 -
 drivers/tty/synclink_gt.c |  1 -
 drivers/tty/synclinkmp.c  |  1 -
 include/linux/hdlc.h  |  2 --
 net/atm/lec.c | 11 +--
 21 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/drivers/char/pcmcia/synclink_cs.c 
b/drivers/char/pcmcia/synclink_cs.c
index d28922d..a7dd5f4 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -4248,7 +4248,6 @@ static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int 
size)
 static const struct net_device_ops hdlcdev_ops = {
.ndo_open   = hdlcdev_open,
.ndo_stop   = hdlcdev_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = hdlcdev_ioctl,
.ndo_tx_timeout = hdlcdev_tx_timeout,
diff --git a/drivers/net/wan/c101.c b/drivers/net/wan/c101.c
index 09a5075..2371e07 100644
--- a/drivers/net/wan/c101.c
+++ b/drivers/net/wan/c101.c
@@ -302,7 +302,6 @@ static void c101_destroy_card(card_t *card)
 static const struct net_device_ops c101_ops = {
.ndo_open   = c101_open,
.ndo_stop   = c101_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = c101_ioctl,
 };
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index b87fe0a..087eb26 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -432,7 +432,6 @@ module_exit(cosa_exit);
 static const struct net_device_ops cosa_ops = {
.ndo_open   = cosa_net_open,
.ndo_stop   = cosa_net_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = cosa_net_ioctl,
.ndo_tx_timeout = cosa_net_timeout,
diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c
index 6292259..7351e54 100644
--- a/drivers/net/wan/dscc4.c
+++ b/drivers/net/wan/dscc4.c
@@ -887,7 +887,6 @@ static inline int dscc4_set_quartz(struct dscc4_dev_priv 
*dpriv, int hz)
 static const struct net_device_ops dscc4_ops = {
.ndo_open   = dscc4_open,
.ndo_stop   = dscc4_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = dscc4_ioctl,
.ndo_tx_timeout = dscc4_tx_timeout,
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 3c9cbf9..03696d3 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -2394,7 +2394,6 @@ fst_init_card(struct fst_card_info *card)
 static const struct net_device_ops fst_ops = {
.ndo_open   = fst_open,
.ndo_stop   = fst_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = fst_ioctl,
.ndo_tx_timeout = fst_tx_timeout,
diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 6564753..e38ce4d 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -992,7 +992,6 @@ static const struct dev_pm_ops uhdlc_pm_ops = {
 static const struct net_device_ops uhdlc_ops = {
.ndo_open   = uhdlc_open,
.ndo_stop   = uhdlc_close,
-   .ndo_change_mtu = hdlc_change_mtu,
.ndo_start_xmit = hdlc_start_xmit,
.ndo_do_ioctl   = uhdlc_ioctl,
 };
diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c
index 9bd4aa8..7221a53 100644
--- a/drivers/net/wan/hdlc.c
+++ b/drivers/net/wan/hdlc.c
@@ -46,14 +46,6 @@ static const char* version = "HDLC support module revision 
1.22";
 
 static struct hdlc_proto *first_proto;
 
-int hdlc_change_mtu(struct net_device *dev, int new_mtu)
-{
-   if ((new_mtu < 68) || (new_mtu > HDLC_MAX_MTU))
-   return -EINVAL;
-   dev->mtu = new_mtu;
-   return 0;
-}
-
 static int

[PATCH net-next 1/6] net: use core MTU range checking in USB NIC drivers

2016-10-18 Thread Jarod Wilson

- Remove stale new_mtu <= 0 check in usbnet.c
- Set appropriate max_mtu for different r8152 driven variants
- Set max_mtu in lan78xx driver

CC: netdev@vger.kernel.org
CC: Woojung Huh 
CC: Microchip Linux Driver Support 
CC: Hayes Wang 
CC: Oliver Neukum 
Signed-off-by: Jarod Wilson 
---
 drivers/net/usb/lan78xx.c |  8 +++-
 drivers/net/usb/r8152.c   | 15 ---
 drivers/net/usb/usbnet.c  |  2 --
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 13f033c..c4e748e 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1980,11 +1980,6 @@ static int lan78xx_change_mtu(struct net_device *netdev, 
int new_mtu)
int old_rx_urb_size = dev->rx_urb_size;
int ret;
 
-   if (new_mtu > MAX_SINGLE_PACKET_SIZE)
-   return -EINVAL;
-
-   if (new_mtu <= 0)
-   return -EINVAL;
/* no second zero-length packet read wanted after mtu-sized packets */
if ((ll_mtu % dev->maxpacket) == 0)
return -EDOM;
@@ -3388,6 +3383,9 @@ static int lan78xx_probe(struct usb_interface *intf,
if (netdev->mtu > (dev->hard_mtu - netdev->hard_header_len))
netdev->mtu = dev->hard_mtu - netdev->hard_header_len;
 
+   /* MTU range: 68 - 9000 */
+   netdev->max_mtu = MAX_SINGLE_PACKET_SIZE;
+
dev->ep_blkin = (intf->cur_altsetting)->endpoint + 0;
dev->ep_blkout = (intf->cur_altsetting)->endpoint + 1;
dev->ep_intr = (intf->cur_altsetting)->endpoint + 2;
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 8d6e13c..4213c28 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -4119,9 +4119,6 @@ static int rtl8152_change_mtu(struct net_device *dev, int 
new_mtu)
break;
}
 
-   if (new_mtu < 68 || new_mtu > RTL8153_MAX_MTU)
-   return -EINVAL;
-
ret = usb_autopm_get_interface(tp->intf);
if (ret < 0)
return ret;
@@ -4311,6 +4308,18 @@ static int rtl8152_probe(struct usb_interface *intf,
netdev->ethtool_ops = 
netif_set_gso_max_size(netdev, RTL_LIMITED_TSO_SIZE);
 
+   /* MTU range: 68 - 1500 or 9194 */
+   netdev->min_mtu = ETH_MIN_MTU;
+   switch (tp->version) {
+   case RTL_VER_01:
+   case RTL_VER_02:
+   netdev->max_mtu = ETH_DATA_LEN;
+   break;
+   default:
+   netdev->max_mtu = RTL8153_MAX_MTU;
+   break;
+   }
+
tp->mii.dev = netdev;
tp->mii.mdio_read = read_mii_word;
tp->mii.mdio_write = write_mii_word;
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index d5071e3..52ec271 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -384,8 +384,6 @@ int usbnet_change_mtu (struct net_device *net, int new_mtu)
int old_hard_mtu = dev->hard_mtu;
int old_rx_urb_size = dev->rx_urb_size;
 
-   if (new_mtu <= 0)
-   return -EINVAL;
// no second zero-length packet read wanted after mtu-sized packets
if ((ll_mtu % dev->maxpacket) == 0)
return -EDOM;
-- 
2.10.0

[PATCH net-next 0/6] net: use core MTU range checking everywhere

2016-10-18 Thread Jarod Wilson

This stack of patches should get absolutely everything in the kernel
converted from doing their own MTU range checking to the core MTU range
checking.

Jarod Wilson (6):
  net: use core MTU range checking in USB NIC drivers
  net: use core MTU range checking in wireless drivers
  net: use core MTU range checking in WAN drivers
  net: use core MTU range checking in core net infra
  net: use core MTU range checking in virt drivers
  net: use core MTU range checking in misc drivers

CC: netdev@vger.kernel.org

 drivers/char/pcmcia/synclink_cs.c  |  1 -
 drivers/firewire/net.c | 12 +
 drivers/infiniband/hw/nes/nes.c|  1 -
 drivers/infiniband/hw/nes/nes.h|  4 +-
 drivers/infiniband/hw/nes/nes_nic.c| 10 ++--
 drivers/misc/sgi-xp/xpnet.c| 21 ++--
 drivers/net/geneve.c   | 48 +++--
 drivers/net/hippi/rrunner.c|  1 -
 drivers/net/hyperv/hyperv_net.h|  4 +-
 drivers/net/hyperv/netvsc_drv.c| 14 ++---
 drivers/net/macvlan.c  |  6 ++-
 drivers/net/rionet.c   | 15 ++
 drivers/net/slip/slip.c| 11 ++--
 drivers/net/tun.c  | 20 +++
 drivers/net/usb/lan78xx.c  |  8 ++-
 drivers/net/usb/r8152.c| 15 --
 drivers/net/usb/usbnet.c   |  2 -
 drivers/net/virtio_net.c   | 23 
 drivers/net/vmxnet3/vmxnet3_drv.c  |  7 +--
 drivers/net/vxlan.c| 62 +++---
 drivers/net/wan/c101.c |  1 -
 drivers/net/wan/cosa.c |  1 -
 drivers/net/wan/dscc4.c|  1 -
 drivers/net/wan/farsync.c  |  1 -
 drivers/net/wan/fsl_ucc_hdlc.c |  1 -
 drivers/net/wan/hdlc.c | 11 +---
 drivers/net/wan/hdlc_fr.c  |  3 +-
 drivers/net/wan/hostess_sv11.c |  1 -
 drivers/net/wan/ixp4xx_hss.c   |  1 -
 drivers/net/wan/lmc/lmc_main.c |  1 -
 drivers/net/wan/n2.c   |  1 -
 drivers/net/wan/pc300too.c |  1 -
 drivers/net/wan/pci200syn.c|  1 -
 drivers/net/wan/sealevel.c |  1 -
 drivers/net/wan/wanxl.c|  1 -
 drivers/net/wireless/ath/wil6210/netdev.c  | 17 +-
 drivers/net/wireless/atmel/atmel.c | 13 ++---
 drivers/net/wireless/cisco/airo.c  | 14 ++---
 drivers/net/wireless/intel/ipw2x00/ipw2100.c   |  3 +-
 drivers/net/wireless/intel/ipw2x00/ipw2200.c   |  8 ++-
 drivers/net/wireless/intel/ipw2x00/libipw.h|  1 -
 drivers/net/wireless/intel/ipw2x00/libipw_module.c |  9 
 drivers/staging/wlan-ng/p80211netdev.c | 18 ++-
 drivers/tty/synclink.c |  1 -
 drivers/tty/synclink_gt.c  |  1 -
 drivers/tty/synclinkmp.c   |  1 -
 include/linux/hdlc.h   |  2 -
 include/linux/hippidevice.h|  1 -
 net/802/hippi.c| 14 +
 net/atm/lec.c  | 11 +---
 net/batman-adv/soft-interface.c| 13 +
 net/bridge/br_device.c |  9 ++--
 net/openvswitch/vport-internal_dev.c   | 10 
 net/sched/sch_teql.c   |  5 +-
 54 files changed, 153 insertions(+), 310 deletions(-)

-- 
2.10.0

Re: [patch net] rtnetlink: Add rtnexthop offload flag to compare mask

2016-10-18 Thread Andy Gospodarek

On Tue, Oct 18, 2016 at 12:59 PM, Jiri Pirko  wrote:
> From: Jiri Pirko 
>
> The offload flag is a status flag and should not be used by
> FIB semantics for comparison.

This is definitely needed.

>
> Fixes: 37ed9493699c ("rtnetlink: add RTNH_F_EXTERNAL flag for fib offload")
> Signed-off-by: Jiri Pirko 
Reviewed-by: Andy Gospodarek 

> ---
> Please queue-up to stable as well. Thanks.
> ---
>  include/uapi/linux/rtnetlink.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
> index 262f037..5a78be5 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -350,7 +350,7 @@ struct rtnexthop {
>  #define RTNH_F_OFFLOAD 8   /* offloaded route */
>  #define RTNH_F_LINKDOWN16  /* carrier-down on nexthop */
>
> -#define RTNH_COMPARE_MASK  (RTNH_F_DEAD | RTNH_F_LINKDOWN)
> +#define RTNH_COMPARE_MASK  (RTNH_F_DEAD | RTNH_F_LINKDOWN | 
> RTNH_F_OFFLOAD)
>
>  /* Macros to handle hexthops */
>
> --
> 2.5.5
>

[PATCH net] tipc: Guard against tiny MTU in tipc_msg_build()

2016-10-18 Thread Ben Hutchings

Qian Zhang (张谦) reported a potential socket buffer overflow in
tipc_msg_build().  The minimum fragment length needs to be checked
against the maximum packet size, which is based on the link MTU.

Reported-by: Qian Zhang (张谦) 
Signed-off-by: Ben Hutchings 
---
This is untested, but I think it fixes the issue reported.  Ideally
tipc_l2_device_event() would also disable use of TIPC on devices with
too small an MTU, like several other protocols do.

Ben.

 net/tipc/msg.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 17201aa8423d..b9124ac82c29 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -274,6 +274,10 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
goto error;
}
 
+   /* Check that fragment and message header will fit */
+   if (INT_H_SIZE + mhsz > pktmax)
+   return -EMSGSIZE;
+
/* Prepare reusable fragment header */
tipc_msg_init(msg_prevnode(mhdr), , MSG_FRAGMENTER,
  FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr));


signature.asc
Description: Digital signature

Re: [PATCH 1/8] tools lib bpf: add error functions

2016-10-18 Thread Wangnan (F)




On 2016/10/19 6:52, Joe Stringer wrote:

On 16 October 2016 at 14:18, Eric Leblond  wrote:

The include of err.h is not explicitly needed in exported
functions and it was causing include conflict with some existing
code due to redefining some macros.

To fix this, let's have error handling functions provided by the
library. Furthermore this will allow user to have a homogeneous
API.

Signed-off-by: Eric Leblond 

Does it need to return the error like this or should we just fix up
the bpf_object__open() API to return errors in a simpler form?

There's already libbpf_set_print(...) for outputting errors, is it
reasonable to just change the library to return NULLs in error cases
instead?


Returning error code to caller so caller knows what happened.
Other subsystems in perf also do this.

Perf hides libbpf's error output (make it silent unless -v),
so it needs a way for receiving libbpf's error code.

I think this patch is good, decouple libbpf.h and kernel headers.

Thank you.

Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.

2016-10-18 Thread Alexei Starovoitov

On Sun, Oct 16, 2016 at 09:41:28AM -0700, William Tu wrote:
> When running bpf_map_lookup on percpu elements, the bytes copied to
> userspace depends on num_possible_cpus() * value_size, which could
> potentially be larger than memory allocated from user, which depends
> on sysconf(_SC_NPROCESSORS_CONF) to get the current cpu num.  As a
> result, the inconsistency might corrupt the user's stack.
> 
> The fact that sysconf(_SC_NPROCESSORS_CONF) != num_possible_cpu()
> happens when cpu hotadd is enabled.  For example, in Fusion when
> setting vcpu.hotadd = "TRUE" or in KVM, setting
>   ./qemu-system-x86_64 -smp 2, maxcpus=4 ...
> the num_possible_cpu() will be 4 and sysconf() will be 2[1].
> Currently any percpu map lookup suffers from this issue; try
> samples/bpf/test_maps.c or tracex3.c.
> 
> The RFC patch adds additional 'size' param from userspace so that
> kernel knows the maximum memory it should copy to the user.
> 
> [1] https://www.mail-archive.com/netdev@vger.kernel.org/msg121183.html
> 
> Signed-off-by: William Tu 
> ---
>  include/uapi/linux/bpf.h   |  5 -
>  kernel/bpf/syscall.c   |  5 +++--
>  samples/bpf/fds_example.c  |  2 +-
>  samples/bpf/libbpf.c   |  3 ++-
>  samples/bpf/libbpf.h   |  2 +-
>  samples/bpf/test_maps.c| 30 +++---
>  tools/include/uapi/linux/bpf.h |  5 -
>  7 files changed, 30 insertions(+), 22 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f09c70b..fa0c40b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -123,7 +123,10 @@ union bpf_attr {
>   __aligned_u64 value;
>   __aligned_u64 next_key;
>   };
> - __u64   flags;
> + union {
> + __u64   flags;
> + __u32   size; /* number of bytes allocated in 
> userspace */
> + };
...
> - if (copy_to_user(uvalue, value, value_size) != 0)
> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>   goto free_value;

I think such approach won't actually fix anything. User space
may lose some of the values and won't have any idea what was lost.
I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
and use /sys/devices/system/cpu/possible instead.
I would argue that glibc should be fixed as well since relying on
ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.

Hi

2016-10-18 Thread Sydney

I want to discuss with you please reply

Need help parsing dropwatch results

2016-10-18 Thread Timur Tabi

Using iperf3 and dropwatch, I discovered that my EMAC driver is dropping 
packets, a lot of them.  This driver is based on an internal version 
(written by someone else) that does not have this problem, so obviously 
there's a bug in my driver.  Unfortunately, I need help understanding 
where in my driver the bug could be.


dropwatch displays this:

3297 drops at net_tx_action+0 (0x08aa1108)
3671 drops at net_tx_action+0 (0x08aa1108)
4055 drops at net_tx_action+0 (0x08aa1108)
3976 drops at net_tx_action+0 (0x08aa1108)
3847 drops at net_tx_action+0 (0x08aa1108)
3933 drops at net_tx_action+0 (0x08aa1108)
3933 drops at net_tx_action+0 (0x08aa1108)
1376 drops at net_tx_action+0 (0x08aa1108)

Can someone tell me what "drops at net_tx_action+0" actually means? How 
does a packet drop at net_tx_action() actually occur?  Where in the 
driver should I look for the problem?


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

Re: [PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Cong Wang

On Tue, Oct 18, 2016 at 2:21 PM, Jamal Hadi Salim  wrote:
> I was sitting on this patch I was going to send ;->
> Does this resolve it?

Your patch makes more sense to me. Maybe we can remove the
event != RTM_DELTFILTER special case too?

Re: [PATCH 1/8] tools lib bpf: add error functions

2016-10-18 Thread Joe Stringer

On 16 October 2016 at 14:18, Eric Leblond  wrote:
> The include of err.h is not explicitly needed in exported
> functions and it was causing include conflict with some existing
> code due to redefining some macros.
>
> To fix this, let's have error handling functions provided by the
> library. Furthermore this will allow user to have a homogeneous
> API.
>
> Signed-off-by: Eric Leblond 

Does it need to return the error like this or should we just fix up
the bpf_object__open() API to return errors in a simpler form?

There's already libbpf_set_print(...) for outputting errors, is it
reasonable to just change the library to return NULLs in error cases
instead?

Re: [PATCH net-next 00/15] ethernet: use core min/max MTU checking

2016-10-18 Thread Jarod Wilson

On Tue, Oct 18, 2016 at 11:33:27AM -0400, David Miller wrote:
> From: Jarod Wilson 
> Date: Mon, 17 Oct 2016 16:29:43 -0400
> 
> > On Mon, Oct 17, 2016 at 04:03:41PM -0400, David Miller wrote:
> >> From: Jarod Wilson 
> >> Date: Mon, 17 Oct 2016 15:54:02 -0400
> >> 
> >> > For the most part, every patch does the same essential thing: removes the
> >> > MTU range checking from the drivers' ndo_change_mtu function, puts those
> >> > ranges into the core net_device min_mtu and max_mtu fields, and where
> >> > possible, removes ndo_change_mtu functions entirely.
> >> 
> >> Jarod, please read my other posting.
> > 
> > Done, didn't see it until just after I'd hit send, have replied there as
> > well.
> > 
> >> You've positively broken the maximum MTU for all of these drivers.
> >> 
> >> That's not cool.
> >>
> >> And this series fixing things doesn't make things better, because now
> >> we've significantly broken bisection for anyone running into this
> >> regression.
> > 
> > Agreed, and my suggestion right now is to revert the 2nd patch from the
> > prior series. I believe it can be resubmitted after all other callers of
> > ether_setup() have been converted to have their own min/max_mtu.
> > 
> >> You should have arranged this in such a way that the drivers needing
> >> > 1500 byte MTU were not impacted at all by your changes, but that
> >> isn't what happened.
> > 
> > Yeah, I must admit to not looking closely enough at the state the first
> > two patches left things in. It was absolutely my intention to not alter
> > behaviour in any way, but I neglected to test sufficiently without this
> > additional set applied.
> 
> So what I'm going to do now is simply just apply your current patch series
> to net-next and hope this gets everything working again.

Unfortunately, no, it doesn't get *everything* working again, because...

direct ether_setup() callers:

drivers/misc/sgi-xp/xpnet.c
drivers/net/geneve.c
drivers/net/macvlan.c
drivers/net/tun.c
drivers/net/vxlan.c
drivers/net/wan/hdlc.c
drivers/net/wan/hdlc_fr.c
drivers/net/wireless/ath/wil6210/netdev.c
drivers/net/wireless/cisco/airo.c
drivers/staging/wlan-ng/p80211netdev.c
net/batman-adv/soft-interface.c
net/bridge/br_device.c
net/openvswitch/vport-internal_dev.c

alloc_etherdev*() callers:
drivers/infiniband/hw/nes/nes_nic.c
drivers/net/hyperv/netvsc_drv.c
drivers/net/rionet.c
drivers/net/usb/lan78xx.c
drivers/net/usb/r8152.c
drivers/net/usb/usbnet.c
drivers/net/virtio_net.c
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/wireless/atmel/atmel.c
drivers/net/wireless/cisco/airo.c
drivers/net/wireless/intel/ipw2x00/libipw_module.c
net/atm/lec.c

I have additional patches for all of these that I haven't yet posted, so
I'd still suggest backing out the one patch to keep the above working too
until the subsequent patches are posted.

> I'm just happy that you acknowledged how badly things got broken, so let's
> move on and try to avoid this happening again in the future.
> 
> Thanks.

I profusely apologize again for the mess. Was trying to clean up a mess,
made another one. Story of my life right now, it seems... :\

-- 
Jarod Wilson
ja...@redhat.com

Re: [PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Daniel Borkmann


On 10/19/2016 12:18 AM, Jamal Hadi Salim wrote:

On 16-10-18 05:59 PM, Daniel Borkmann wrote:

[...]

Ahh sure, looks good to me. All other RTM_DELTFILTER events
would be for the entire tcf_proto and 'enforced' destroy, so
zero handle would indicate that then as opposed to an individual
cls delete with non-zero handle. Seems fine.


Ok, thanks. I will send an official patch when i get a chance.
I tested earlier on net-next; but maybe this deserves to go
on net.


net would be appropriate as mentioned in commit message.

Thanks,
Daniel

Re: [PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Jamal Hadi Salim


On 16-10-18 05:59 PM, Daniel Borkmann wrote:


Ahh sure, looks good to me. All other RTM_DELTFILTER events
would be for the entire tcf_proto and 'enforced' destroy, so
zero handle would indicate that then as opposed to an individual
cls delete with non-zero handle. Seems fine.


Ok, thanks. I will send an official patch when i get a chance.
I tested earlier on net-next; but maybe this deserves to go
on net.

cheers,
jamal

Re: [PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Daniel Borkmann


On 10/18/2016 11:21 PM, Jamal Hadi Salim wrote:
[...]

I was sitting on this patch I was going to send ;->
Does this resolve it?


Ahh sure, looks good to me. All other RTM_DELTFILTER events
would be for the entire tcf_proto and 'enforced' destroy, so
zero handle would indicate that then as opposed to an individual
cls delete with non-zero handle. Seems fine.

Re: [PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Jamal Hadi Salim


On 16-10-18 04:18 PM, Daniel Borkmann wrote:

While trying out [1][2], I noticed that tc monitor doesn't show the
correct handle on delete:

  $ tc monitor
  qdisc clsact : dev eno1 parent :fff1
  filter dev eno1 ingress protocol all pref 49152 bpf handle 0x2a [...]
  deleted filter dev eno1 ingress protocol all pref 49152 bpf handle 0xf3be0c80

In fact, the shown handle points to a kernel address, in this case
0x8807f3be0c80, which points to a struct cls_bpf_prog from bpf
classifier.

The issue is not bpf specific though. tcf_fill_node() sets a fh as
tcm->tcm_handle, which gets overridden on != RTM_DELTFILTER events
only. For RTM_DELTFILTER events, we cannot call the classifier's
dump() handler, since notification is given after delete() handler
returned with success.

At latest when a classifier's dump() handler is called, tm->tcm_handle
is filled with an actual handle. They are currently classifier
internal, meaning a tcf_proto can handle multiple classifiers if
the implementation supports it, so it needs to be queried from the
callback.

For RTM_DELTFILTER, the fh value contains the address of the object
to dump. Commit 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.")
added the logic to assign tcm->tcm_handle = fh. tcm_handle is 32bit
so for 64bit archs, it's stored truncated. Prior to that commit, it
was set to 0. Reintroduce this, so we at least don't leak the kernel
address or parts of it to unprivileged user space listeners.

Since user space cannot make any sense out of this 32bit part,
passing a random number would be just as good. Let's pass 0, since i)
this allows to add the feature at some point for net-next, and ii)
this is also consistent with notifications via tfilter_notify_chain()
when we delete the entire chain.

  [1] http://patchwork.ozlabs.org/patch/682828/
  [2] http://patchwork.ozlabs.org/patch/682829/

Fixes: 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.")
Signed-off-by: Daniel Borkmann 
---
 ( Commit is in -history tree. Jamal, please take a look if you have
   a chance, thanks. )

 net/sched/cls_api.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2ee29a3..e11bdc5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -400,12 +400,11 @@ static int tcf_fill_node(struct net *net, struct sk_buff 
*skb,
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
+   tcm->tcm_handle = 0;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
goto nla_put_failure;
-   tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
-   tcm->tcm_handle = 0;
if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}




I was sitting on this patch I was going to send ;->
Does this resolve it?

cheers,
jamal


diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2ee29a3..2b2a797 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -345,7 +345,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct 
nlmsghdr *n)
if (err == 0) {
struct tcf_proto *next = 
rtnl_dereference(tp->next);
 
-   tfilter_notify(net, skb, n, tp, fh,
+   tfilter_notify(net, skb, n, tp,
+  t->tcm_handle,
   RTM_DELTFILTER, false);
if (tcf_destroy(tp, false))
RCU_INIT_POINTER(*back, next);

[PATCH net] tcp: do not export sysctl_tcp_low_latency

2016-10-18 Thread Eric Dumazet

From: Eric Dumazet 

Since commit b2fb4f54ecd4 ("tcp: uninline tcp_prequeue()") we no longer
access sysctl_tcp_low_latency from a module.

Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_ipv4.c |1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 79d55eb3ec3f..61b7be303eec 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -86,7 +86,6 @@
 
 int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 #ifdef CONFIG_TCP_MD5SIG
 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key 
*key,

[PATCH net] sched, cls: don't dump kernel addr into tc monitors on delete event

2016-10-18 Thread Daniel Borkmann

While trying out [1][2], I noticed that tc monitor doesn't show the
correct handle on delete:

  $ tc monitor
  qdisc clsact : dev eno1 parent :fff1
  filter dev eno1 ingress protocol all pref 49152 bpf handle 0x2a [...]
  deleted filter dev eno1 ingress protocol all pref 49152 bpf handle 0xf3be0c80

In fact, the shown handle points to a kernel address, in this case
0x8807f3be0c80, which points to a struct cls_bpf_prog from bpf
classifier.

The issue is not bpf specific though. tcf_fill_node() sets a fh as
tcm->tcm_handle, which gets overridden on != RTM_DELTFILTER events
only. For RTM_DELTFILTER events, we cannot call the classifier's
dump() handler, since notification is given after delete() handler
returned with success.

At latest when a classifier's dump() handler is called, tm->tcm_handle
is filled with an actual handle. They are currently classifier
internal, meaning a tcf_proto can handle multiple classifiers if
the implementation supports it, so it needs to be queried from the
callback.

For RTM_DELTFILTER, the fh value contains the address of the object
to dump. Commit 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.")
added the logic to assign tcm->tcm_handle = fh. tcm_handle is 32bit
so for 64bit archs, it's stored truncated. Prior to that commit, it
was set to 0. Reintroduce this, so we at least don't leak the kernel
address or parts of it to unprivileged user space listeners.

Since user space cannot make any sense out of this 32bit part,
passing a random number would be just as good. Let's pass 0, since i)
this allows adding the feature at some point for net-next, and ii)
this is also consistent with notifications via tfilter_notify_chain()
when we delete the entire chain.

  [1] http://patchwork.ozlabs.org/patch/682828/
  [2] http://patchwork.ozlabs.org/patch/682829/

Fixes: 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.")
Signed-off-by: Daniel Borkmann 
---
 ( Commit is in -history tree. Jamal, please take a look if you have
   a chance, thanks. )

 net/sched/cls_api.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2ee29a3..e11bdc5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -400,12 +400,11 @@ static int tcf_fill_node(struct net *net, struct sk_buff 
*skb,
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
+   tcm->tcm_handle = 0;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
goto nla_put_failure;
-   tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
-   tcm->tcm_handle = 0;
if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}
-- 
1.9.3

Re: [PATCH net-next] openvswitch: remove unnecessary EXPORT_SYMBOLs

2016-10-18 Thread Pravin Shelar

On Tue, Oct 18, 2016 at 4:47 AM, Jiri Benc  wrote:
> Many symbols exported to other modules are really used only by
> openvswitch.ko. Remove the exports.
>
> Tested by loading all 4 openvswitch modules, nothing breaks.
>
> Signed-off-by: Jiri Benc 
> ---
>  net/openvswitch/datapath.c | 2 --
>  net/openvswitch/vport-netdev.c | 1 -
>  net/openvswitch/vport.c| 2 --
>  3 files changed, 5 deletions(-)
>
...
...
> @@ -479,7 +478,6 @@ void ovs_vport_deferred_free(struct vport *vport)
>
> call_rcu(&vport->rcu, free_vport_rcu);
>  }
> -EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
>

ovs_vport_deferred_free() is not used anywhere. Can you remove it?

Re: [PATCH net-next] bnx2x: ethtool -x full support

2016-10-18 Thread Eric Dumazet

On Tue, 2016-10-18 at 10:08 -0700, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> Implement ethtool -x full support, so that rss key can be fetched
> instead of assuming it matches /proc/sys/net/core/netdev_rss_key
> content.

I'll send a V2, tested with CONFIG_BNX2X_SRIOV=y ;)

Re: [PATCH v2 net-next 0/5] Interrupt support for mv88e6xxx

2016-10-18 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

>> mv88e6xxx_g1_* should've been moved to global1.c, I'll move them
>> later.
>
> Hi Vivian
>
> I did consider that, but at the moment, there are only access
> functions in there. But the code should be easy to move.

True, looks good to me like this for the moment :-)

Thanks,

Vivien

Re: [PATCH v2 net-next 0/5] Interrupt support for mv88e6xxx

2016-10-18 Thread Andrew Lunn

> mv88e6xxx_g1_* should've been moved to global1.c, I'll move them later.

Hi Vivian

I did consider that, but at the moment, there are only access
functions in there. But the code should be easy to move.

  Andrew

Re: [PATCH v2 net-next 0/5] Interrupt support for mv88e6xxx

2016-10-18 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

> This patchset adds interrupt controller support to the MV88E6xxx.  This
> allows access to the interrupts the internal PHYs generate. These
> interrupts can then be associated to a PHY device in the device tree
> and used by the PHY lib, rather than polling.
>
> Since interrupt handling needs to make MDIO bus accesses, threaded
> interrupts are used. The phylib needs to request the PHY interrupt
> using the threaded IRQ API. This in turn allows some simplification to
> the code, in that the phylib interrupt handler can directly call
> phy_change(), rather than use a work queue. The work queue is however
> retained for the phy_mac_interrupt() call, which can be called in hard
> interrupt context.
>
> Since RFC v1:
>
> Keep phy_mac_interrupt() callable in hard IRQ context.
>
> The fix to trigger the phy state machine transitions on interrupts has
> already been submitted, so is dropped from here.
>
> Added back shared interrupts support.
>
>
> Andrew Lunn (5):
>   net: dsa: mv88e6xxx: Implement interrupt support.
>   net: phy: Use threaded IRQ, to allow IRQ from sleeping devices
>   net: phy: Threaded interrupts allow some simplification
>   net: phy: Use phy name when requesting the interrupt
>   arm: vf610: zii devel b: Add support for switch interrupts
>
>  .../devicetree/bindings/net/dsa/marvell.txt|  21 +-
>  arch/arm/boot/dts/vf610-zii-dev-rev-b.dts  |  51 +
>  drivers/net/dsa/mv88e6xxx/chip.c   | 248 
> -
>  drivers/net/dsa/mv88e6xxx/global2.c| 139 +++-
>  drivers/net/dsa/mv88e6xxx/global2.h|  11 +
>  drivers/net/dsa/mv88e6xxx/mv88e6xxx.h  |  31 +++
>  drivers/net/phy/phy.c  |  52 +++--
>  drivers/net/phy/phy_device.c   |   2 +-
>  include/linux/phy.h|   5 +-
>  9 files changed, 522 insertions(+), 38 deletions(-)

mv88e6xxx_g1_* should've been moved to global1.c, I'll move them later.

For what it's worth:

Reviewed-by: Vivien Didelot 

Thanks,

Vivien

Re: iproute: ss truncates abstract unix domain socket embedding null

2016-10-18 Thread Isaac Boukris

Hi again,

On Sun, Oct 16, 2016 at 11:43 PM, Isaac Boukris  wrote:
> Hello,
>
> The unix(7) man page says that null have no special meaning in
> abstract unix domain socket address (the length is specified
> therefore).
>
> However, when such name (embedding null) is used, ss (and netstat)
> will only show up to the first null occurrence (second technically, if
> we count the null prefix).
> e.g. the name "\0/tmp/fo\0.sock" is displayed as: "@/tmp/fo" (whilst
> strace tool shows it as: sun_path=@"/tmp/fo\0.sock").
>
> Would it be more useful if it printed the whole name and escaped the null?
> If so, would '\0' be ok for escaping the null?


Meanwhile, I've got it to escape the null character with '\0' as suggested.
Can anyone take a look and advise if I'm on the right track? Thanks!


diff --git a/misc/ss.c b/misc/ss.c
index dd77b81..3e41f44 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2869,7 +2869,7 @@ static int unix_show_sock(const struct
sockaddr_nl *addr, struct nlmsghdr *nlh,
struct filter *f = (struct filter *)arg;
struct unix_diag_msg *r = NLMSG_DATA(nlh);
struct rtattr *tb[UNIX_DIAG_MAX+1];
-   char name[128];
+   char name[128*2];
struct sockstat stat = { .name = "*", .peer_name = "*" };

parse_rtattr(tb, UNIX_DIAG_MAX, (struct rtattr *)(r+1),
@@ -2891,11 +2891,25 @@ static int unix_show_sock(const struct
sockaddr_nl *addr, struct nlmsghdr *nlh,
}
if (tb[UNIX_DIAG_NAME]) {
int len = RTA_PAYLOAD(tb[UNIX_DIAG_NAME]);
+   char *real_name = RTA_DATA(tb[UNIX_DIAG_NAME]);

-   memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
-   name[len] = '\0';
-   if (name[0] == '\0')
+   if (real_name[0] == '\0') {
+   int i, j;
name[0] = '@';
+   for (i = j = 1; i < len; ++i) {
+   if (real_name[i] == '\0') {
+   name[j++] = '\\';
+   name[j++] = '0';
+   }
+   else
+   name[j++] = real_name[i];
+   }
+   name[j] = '\0';
+   } else {
+   memcpy(name, real_name, len);
+   name[len] = '\0';
+   }
+
stat.name = &name[0];
memcpy(stat.local.data, &stat.name, sizeof(stat.name));
}

Re: [PATCH v3 net-next 8/8] net: qualcomm: add QCA7000 UART driver

2016-10-18 Thread David Miller

From: Stefan Wahren 
Date: Tue, 18 Oct 2016 13:27:34 +0200

> +void
> +qca_tty_receive(struct tty_struct *tty, const unsigned char *cp, char *fp,
> + int count)
> +{
> + struct qcauart *qca = tty->disc_data;
> + struct net_device_stats *n_stats = >net_dev->stats;
> + int dropped = 0;

Please order local variable declarations from longest to shortest line.

> +netdev_tx_t
> +qcauart_netdev_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> + struct qcauart *qca = netdev_priv(dev);
> + struct net_device_stats *n_stats = >stats;
> + u8 *pos;
> + u8 pad_len = 0;
> + int written;

Likewise.

Re: [PATCH v3 net-next 5/8] net: qualcomm: move MTU handling to qca_common

2016-10-18 Thread David Miller

From: Stefan Wahren 
Date: Tue, 18 Oct 2016 13:27:31 +0200

> The MTU of the QCA7000 is independent from its host interface (UART,SPI).
> So move the change_mtu function to qca_common.
> 
> Signed-off-by: Stefan Wahren 
> ---
>  drivers/net/ethernet/qualcomm/qca_common.c | 11 +++
>  drivers/net/ethernet/qualcomm/qca_common.h |  3 +++
>  drivers/net/ethernet/qualcomm/qca_spi.c| 13 +
>  3 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ethernet/qualcomm/qca_common.c 
> b/drivers/net/ethernet/qualcomm/qca_common.c
> index 26453a9..9020c57 100644
> --- a/drivers/net/ethernet/qualcomm/qca_common.c
> +++ b/drivers/net/ethernet/qualcomm/qca_common.c
> @@ -154,3 +154,14 @@ qcafrm_fsm_decode(struct qcafrm_handle *handle, u8 *buf, 
> u16 buf_len, u8 recv_by
>  
>   return ret;
>  }
> +
> +int
> +qcacmn_netdev_change_mtu(struct net_device *dev, int new_mtu)
> +{
> + if ((new_mtu < QCAFRM_ETHMINMTU) || (new_mtu > QCAFRM_ETHMAXMTU))
> + return -EINVAL;

In net-next this limiting is implemented by the driver properly setting
netdev->min_mtu and netdev->max_mtu respectively.

And once you do that, you no longer need this method at all.

Re: [PATCH v3 net-next 3/8] net: qualcomm: move qcaspi_tx_cmd to qca_spi.c

2016-10-18 Thread David Miller

From: Stefan Wahren 
Date: Tue, 18 Oct 2016 13:27:29 +0200

> diff --git a/drivers/net/ethernet/qualcomm/qca_7k.h 
> b/drivers/net/ethernet/qualcomm/qca_7k.h
> index 1cad851..b390b1f 100644
> --- a/drivers/net/ethernet/qualcomm/qca_7k.h
> +++ b/drivers/net/ethernet/qualcomm/qca_7k.h
> @@ -67,6 +67,5 @@
>  void qcaspi_spi_error(struct qcaspi *qca);
>  int qcaspi_read_register(struct qcaspi *qca, u16 reg, u16 *result);
>  int qcaspi_write_register(struct qcaspi *qca, u16 reg, u16 value);
> -int qcaspi_tx_cmd(struct qcaspi *qca, u16 cmd);
>  
>  #endif /* _QCA_7K_H */
> diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c 
> b/drivers/net/ethernet/qualcomm/qca_spi.c
> index 6e2add9..5bcac62 100644
> --- a/drivers/net/ethernet/qualcomm/qca_spi.c
> +++ b/drivers/net/ethernet/qualcomm/qca_spi.c
> @@ -192,6 +192,30 @@ qcaspi_read_legacy(struct qcaspi *qca, u8 *dst, u32 len)
>   return len;
>  }
>  
> +int
> +qcaspi_tx_cmd(struct qcaspi *qca, u16 cmd)
> +{

If you do this then you must mark this function 'static'.

Re: [PATCH] ipv6: fix signedness of tmp_prefered_lft underflow check

2016-10-18 Thread David Miller

From: Jiri Bohac 
Date: Tue, 18 Oct 2016 17:01:54 +0200

> Commit 76506a986dc31394fd1f2741db037d29c7e57843 (IPv6: fix
> DESYNC_FACTOR) introduced a buggy check for underflow of
> tmp_prefered_lft. tmp_prefered_lft is unsigned, so the condition
> is always false.
> 
> Signed-off-by: Jiri Bohac 
> Reported-by: Julia Lawall 
> Fixes: 76506a986dc3 ("IPv6: fix DESYNC_FACTOR")

Does the check make any sense at all?  I'd say just remove it.

Re: [PATCH net-next] r8152: add new products of Lenovo

2016-10-18 Thread David Miller

From: Hayes Wang 
Date: Tue, 18 Oct 2016 11:41:48 +0800

> Add the following four products of Lenovo and sort the order of the list.
> 
>   VID PID
>   0x17ef  0x3062
>   0x17ef  0x3069
>   0x17ef  0x720c
>   0x17ef  0x7214
> 
> Signed-off-by: Hayes Wang 

Applied.

Re: [PATCH 2/3 net] ibmvnic: Fix GFP_KERNEL allocation in interrupt context

2016-10-18 Thread David Miller

From: Thomas Falcon 
Date: Mon, 17 Oct 2016 15:28:09 -0500

> Signed-off-by: Thomas Falcon 

Applied.

Re: [PATCH 1/1] net: vlan: Use sizeof instead of literal number

2016-10-18 Thread David Miller

From: f...@ikuai8.com
Date: Tue, 18 Oct 2016 08:44:02 +0800

> From: Gao Feng 
> 
> Use sizeof variable instead of literal number to enhance the readability.
> 
> Signed-off-by: Gao Feng 

Applied.

Re: [PATCH 20/28] net: bcm63xx: avoid referencing uninitialized variable

2016-10-18 Thread David Miller

From: Arnd Bergmann 
Date: Tue, 18 Oct 2016 00:16:08 +0200

> gcc found a reference to an uninitialized variable in the error handling
> of bcm_enet_open, introduced by a recent cleanup:
> 
> drivers/net/ethernet/broadcom/bcm63xx_enet.c: In function 'bcm_enet_open'
> drivers/net/ethernet/broadcom/bcm63xx_enet.c:1129:2: warning: 'phydev' may be 
> used uninitialized in this function [-Wmaybe-uninitialized]
> 
> This makes the use of that variable conditional, so we only reference it
> here after it has been used before. Unlike my normal patches, I have not
> build-tested this one, as I don't currently have mips test in my
> randconfig setup.
> 
> Fixes: 625eb8667d6f ("net: ethernet: broadcom: bcm63xx: use phydev from 
> struct net_device")
> Cc: Philippe Reynes 
> Reported-by: kbuild test robot 
> Signed-off-by: Arnd Bergmann 

Applied.

Re: [PATCH 21/28] net/hyperv: avoid uninitialized variable

2016-10-18 Thread David Miller

From: Arnd Bergmann 
Date: Tue, 18 Oct 2016 00:16:09 +0200

> The hdr_offset variable is only set if we deal with a TCP or UDP packet,
> but as the check surrounding its usage tests for skb_is_gso()
> instead, the compiler has no idea if the variable is initialized
> or not at that point:
> 
> drivers/net/hyperv/netvsc_drv.c: In function ‘netvsc_start_xmit’:
> drivers/net/hyperv/netvsc_drv.c:494:42: error: ‘hdr_offset’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> 
> This adds an additional check for the transport type, which
> tells the compiler that this path cannot happen. Since the
> get_net_transport_info() function should always be inlined
> here, I don't expect this to result in additional runtime
> checks.
> 
> Signed-off-by: Arnd Bergmann 

Applied.

Re: [PATCH 27/28] rocker: fix maybe-uninitialized warning

2016-10-18 Thread David Miller

From: Arnd Bergmann 
Date: Tue, 18 Oct 2016 00:16:15 +0200

> In some rare configurations, we get a warning about the 'index' variable
> being used without an initialization:
> 
> drivers/net/ethernet/rocker/rocker_ofdpa.c: In function 
> ‘ofdpa_port_fib_ipv4.isra.16.constprop’:
> drivers/net/ethernet/rocker/rocker_ofdpa.c:2425:92: warning: ‘index’ may be 
> used uninitialized in this function [-Wmaybe-uninitialized]
> 
> This is a false positive, the logic is just a bit too complex for gcc
> to follow here. Moving the initialization of 'index' a little further
> down makes it clear to gcc that the function always returns an error
> if it is not initialized.
> 
> Signed-off-by: Arnd Bergmann 

Applied.

Re: [PATCH net] soreuseport: do not export reuseport_add_sock()

2016-10-18 Thread David Miller

From: Eric Dumazet 
Date: Mon, 17 Oct 2016 14:22:48 -0700

> From: Eric Dumazet 
> 
> reuseport_add_sock() is not used from a module,
> no need to export it.
> 
> Signed-off-by: Eric Dumazet 

Applied, thanks Eric.

Re: [PATCH 3/3 net] ibmvnic: Update MTU after device initialization

2016-10-18 Thread David Miller

From: Thomas Falcon 
Date: Mon, 17 Oct 2016 15:28:10 -0500

> It is possible for the MTU to be changed during the initialization
> process with the VNIC Server.  Ensure that the net device is updated 
> to reflect the new MTU.
> 
> Signed-off-by: Thomas Falcon 

Applied.

Re: [PATCH 1/3 net v2] ibmvnic: Driver Version 1.0.1

2016-10-18 Thread David Miller

From: Thomas Falcon 
Date: Mon, 17 Oct 2016 15:56:29 -0500

> Increment driver version to reflect features that have
> been added since release.
> 
> Signed-off-by: Thomas Falcon 

Applied.

Re: [PATCH v3 0/4] support smc91x on mainstone and devicetree

2016-10-18 Thread David Miller

From: Robert Jarzmik 
Date: Mon, 17 Oct 2016 21:45:28 +0200

> This series aims at bringing support to the mainstone board on a device-tree based
> build, as what is already in place for legacy mainstone.
> 
> The bulk of the mainstone "specific" behavior is that a u16 write doesn't work
> on a address of the form 4*n + 2, while it works on 4*n.
> 
> The legacy workaround was in SMC_outw(), with calls to
> machine_is_mainstone(). These calls don't work with a pxa27x-dt machine type,
> which is used when a generic device-tree pxa27x machine is used to boot the
> mainstone board.
> 
> Therefore, this series enables the smc91c111 adapter of the mainstone board to
> work on a device-tree build, exactly as it's been working for years with the
> legacy arch/arm/mach-pxa/mainstone.c definition.
> 
> As a sum up, this extends an existing mechanism to device-tree based pxa 
> platforms.

Series applied, thanks.

Re: [PATCH v3 net-next 0/7] udp: Flow dissection for tunnels

2016-10-18 Thread David Miller

From: Tom Herbert 
Date: Tue, 18 Oct 2016 10:02:36 -0700

> v3:
>   - Fix build issues with modules that call IPv6 functions and
> CONFIG_INET is not set.
>   - Fix compilation error in init'ing .flow_dissect in IPv6 UDP
> offload.

Still doesn't build:

net/ipv6/udp_offload.c:208:19: error: initialization from incompatible pointer 
type [-Werror=incompatible-pointer-types]
   .flow_dissect = udp6_flow_dissect,
   ^
net/ipv6/udp_offload.c:208:19: note: (near initialization for 
‘udpv6_offload.callbacks.flow_dissect’)
cc1: some warnings being treated as errors
scripts/Makefile.build:289: recipe for target 'net/ipv6/udp_offload.o' failed
make[2]: *** [net/ipv6/udp_offload.o] Error 1
make[2]: *** Waiting for unfinished jobs
scripts/Makefile.build:440: recipe for target 'net/ipv6' failed
make[1]: *** [net/ipv6] Error 2
make[1]: *** Waiting for unfinished jobs
Makefile:969: recipe for target 'net' failed
make: *** [net] Error 2
make: *** Waiting for unfinished jobs

The final argument to udp6_flow_dissect() is marked const but that is not what
the method definition wants.

Re: [PATCH net-next] ethernet/sfc: use core min/max MTU checking

2016-10-18 Thread David Miller

From: Bert Kenward 
Date: Tue, 18 Oct 2016 17:47:45 +0100

> Fixes: 61e84623 ("net: centralize net_device min/max MTU checking")
> Signed-off-by: Bert Kenward 

Applied with Fixes tag fixed up to use a 12 character SHA1-ID.

Re: [PATCH net v2] bridge: multicast: restore perm router ports on multicast enable

2016-10-18 Thread David Miller

From: Nikolay Aleksandrov 
Date: Tue, 18 Oct 2016 18:09:48 +0200

> Satish reported a problem with the perm multicast router ports not getting
> reenabled after some series of events, in particular if it happens that the
> multicast snooping has been disabled and the port goes to disabled state
> then it will be deleted from the router port list, but if it moves into
> non-disabled state it will not be re-added because the mcast snooping is
> still disabled, and enabling snooping later does nothing.
> 
> Here are the steps to reproduce, setup br0 with snooping enabled and eth1
> added as a perm router (multicast_router = 2):
> 1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping
> 2. $ ip l set eth1 down
> ^ This step deletes the interface from the router list
> 3. $ ip l set eth1 up
> ^ This step does not add it again because mcast snooping is disabled
> 4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping
> 5. $ bridge -d -s mdb show
> 
> 
> At this point we have mcast enabled and eth1 as a perm router (value = 2)
> but it is not in the router list which is incorrect.
> 
> After this change:
> 1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping
> 2. $ ip l set eth1 down
> ^ This step deletes the interface from the router list
> 3. $ ip l set eth1 up
> ^ This step does not add it again because mcast snooping is disabled
> 4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping
> 5. $ bridge -d -s mdb show
> router ports on br0: eth1
> 
> Note: we can directly do br_multicast_enable_port for all because the
> querier timer already has checks for the port state and will simply
> expire if it's in blocking/disabled. See the comment added by
> commit 9aa66382163e7 ("bridge: multicast: add a comment to
> br_port_state_selection about blocking state")
> 
> Fixes: 561f1103a2b7 ("bridge: Add multicast_snooping sysfs toggle")
> Reported-by: Satish Ashok 
> Signed-off-by: Nikolay Aleksandrov 

Applied and queued up for -stable, thanks!

[PATCH net-next] bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers

2016-10-18 Thread Thomas Graf

A BPF program is required to check the return register of a
map_elem_lookup() call before accessing memory. The verifier keeps
track of this by converting the type of the result register from
PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional
jump ensures safety. This check is currently exclusively performed
for the result register 0.

In the event the compiler reorders instructions, BPF_MOV64_REG
instructions may be moved before the conditional jump which causes
them to keep their type PTR_TO_MAP_VALUE_OR_NULL to which the
verifier objects when the register is accessed:

0: (b7) r1 = 10
1: (7b) *(u64 *)(r10 -8) = r1
2: (bf) r2 = r10
3: (07) r2 += -8
4: (18) r1 = 0x59c0
6: (85) call 1
7: (bf) r4 = r0
8: (15) if r0 == 0x0 goto pc+1
 R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp
9: (7a) *(u64 *)(r4 +0) = 0
R4 invalid mem access 'map_value_or_null'

This commit extends the verifier to keep track of all identical
PTR_TO_MAP_VALUE_OR_NULL registers after a map_elem_lookup() by
assigning them an ID and then marking them all when the conditional
jump is observed.

Signed-off-by: Thomas Graf 
Reviewed-by: Josef Bacik 
Acked-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf_verifier.h|  2 +-
 kernel/bpf/verifier.c   | 61 +---
 tools/testing/selftests/bpf/test_verifier.c | 72 +
 3 files changed, 118 insertions(+), 17 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7035b99..ac5b393 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -23,13 +23,13 @@ struct bpf_reg_state {
 * result in a bad access.
 */
u64 min_value, max_value;
+   u32 id;
union {
/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE 
*/
s64 imm;
 
/* valid when type == PTR_TO_PACKET* */
struct {
-   u32 id;
u16 off;
u16 range;
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99a7e5b..846d7ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,9 +212,10 @@ static void print_verifier_state(struct bpf_verifier_state 
*state)
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 t == PTR_TO_MAP_VALUE_OR_NULL ||
 t == PTR_TO_MAP_VALUE_ADJ)
-   verbose("(ks=%d,vs=%d)",
+   verbose("(ks=%d,vs=%d,id=%u)",
reg->map_ptr->key_size,
-   reg->map_ptr->value_size);
+   reg->map_ptr->value_size,
+   reg->id);
if (reg->min_value != BPF_REGISTER_MIN_RANGE)
verbose(",min_value=%llu",
(unsigned long long)reg->min_value);
@@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state 
*regs, u32 regno)
 {
BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
+   regs[regno].id = 0;
regs[regno].imm = 0;
 }
 
@@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifier_env *env, int 
func_id)
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+   regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose("unknown return type %d of func %d\n",
fn->ret_type, func_id);
@@ -1644,8 +1647,7 @@ static int check_alu_op(struct bpf_verifier_env *env, 
struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
}
-   regs[insn->dst_reg].type = UNKNOWN_VALUE;
-   regs[insn->dst_reg].map_ptr = NULL;
+   mark_reg_unknown_value(regs, insn->dst_reg);
}
} else {
/* case: R = imm
@@ -1907,6 +1909,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state 
*true_reg,
check_reg_overflow(true_reg);
 }
 
+static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
+enum bpf_reg_type type)
+{
+   struct bpf_reg_state *reg = &regs[regno];
+
+   if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+   reg->type = type;
+   if (type == UNKNOWN_VALUE)
+   mark_reg_unknown_value(regs, regno);
+   }
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_map_regs(struct bpf_verifier_state

Re: [PATCH net-next] ethernet/sfc: use core min/max MTU checking

2016-10-18 Thread Sergei Shtylyov


On 10/18/2016 07:47 PM, Bert Kenward wrote:


Fixes: 61e84623 ("net: centralize net_device min/max MTU checking")


   The commit SHA1 should be at least 12 digits here.


Signed-off-by: Bert Kenward 

[...]

MBR, Sergei

Re: [PATCH -next] qed: Remove useless set memory to zero use memset()

2016-10-18 Thread Sergei Shtylyov


On 10/18/2016 06:54 PM, Wei Yongjun wrote:


From: Wei Yongjun 

The memory return by kzalloc() has already be set to zero, so


   Returned?


remove useless memset(0).

Signed-off-by: Wei Yongjun 

[...]

MBR, Sergei

[PATCH net-next] bnx2x: ethtool -x full support

2016-10-18 Thread Eric Dumazet

From: Eric Dumazet 

Implement ethtool -x full support, so that rss key can be fetched
instead of assuming it matches /proc/sys/net/core/netdev_rss_key
content.

We might add "ethtool --rxfh" support later to set a different rss key.

Tested:

lpk51:~# ethtool --show-rxfh eth0
RX flow hash indirection table for eth0 with 4 RX ring(s):
0:  0 1 2 3 0 1 2 3
8:  0 1 2 3 0 1 2 3
   16:  0 1 2 3 0 1 2 3
   24:  0 1 2 3 0 1 2 3
   32:  0 1 2 3 0 1 2 3
   40:  0 1 2 3 0 1 2 3
   48:  0 1 2 3 0 1 2 3
   56:  0 1 2 3 0 1 2 3
   64:  0 1 2 3 0 1 2 3
   72:  0 1 2 3 0 1 2 3
   80:  0 1 2 3 0 1 2 3
   88:  0 1 2 3 0 1 2 3
   96:  0 1 2 3 0 1 2 3
  104:  0 1 2 3 0 1 2 3
  112:  0 1 2 3 0 1 2 3
  120:  0 1 2 3 0 1 2 3
RSS hash key:
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac
lpk51:~# cat /proc/sys/net/core/netdev_rss_key
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac:22:1f:50:76:d4:c8:a5:20:7b:61:3c:0c

Signed-off-by: Eric Dumazet 
Cc: Ariel Elior 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c |   47 ++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c  |2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h  |5 -
 4 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9108cd4c45..b979bb7c4ffb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2106,7 +2106,7 @@ int bnx2x_rss(struct bnx2x *bp, struct 
bnx2x_rss_config_obj *rss_obj,
 
if (config_hash) {
/* RSS keys */
-   netdev_rss_key_fill(params.rss_key, T_ETH_RSS_KEY * 4);
+   netdev_rss_key_fill(_obj->rss_key, T_ETH_RSS_KEY * 4);
__set_bit(BNX2X_RSS_SET_SRCH, _flags);
}
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 85a7800bfc12..28bc9479fc74 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3421,6 +3421,13 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device 
*dev)
return T_ETH_INDIRECTION_TABLE_SIZE;
 }
 
+static u32 bnx2x_get_rxfh_key_size(struct net_device *dev)
+{
+   struct bnx2x *bp = netdev_priv(dev);
+
+   return (bp->port.pmf || !CHIP_IS_E1x(bp)) ? T_ETH_RSS_KEY * 4 : 0;
+}
+
 static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
  u8 *hfunc)
 {
@@ -3430,23 +3437,30 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 
*indir, u8 *key,
 
if (hfunc)
*hfunc = ETH_RSS_HASH_TOP;
-   if (!indir)
-   return 0;
 
-   /* Get the current configuration of the RSS indirection table */
-   bnx2x_get_rss_ind_table(>rss_conf_obj, ind_table);
-
-   /*
-* We can't use a memcpy() as an internal storage of an
-* indirection table is a u8 array while indir->ring_index
-* points to an array of u32.
-*
-* Indirection table contains the FW Client IDs, so we need to
-* align the returned table to the Client ID of the leading RSS
-* queue.
-*/
-   for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
-   indir[i] = ind_table[i] - bp->fp->cl_id;
+   if (key) {
+   if (bp->port.pmf || !CHIP_IS_E1x(bp))
+   memcpy(key, >rss_conf_obj.rss_key, T_ETH_RSS_KEY * 
4);
+   else
+   memset(key, 0, T_ETH_RSS_KEY * 4);
+   }
+
+   if (indir) {
+   /* Get the current configuration of the RSS indirection table */
+   bnx2x_get_rss_ind_table(>rss_conf_obj, ind_table);
+
+   /*
+* We can't use a memcpy() as an internal storage of an
+* indirection table is a u8 array while indir->ring_index
+* points to an array of u32.
+*
+* Indirection table contains the FW Client IDs, so we need to
+* align the returned table to the Client ID of the leading RSS
+* queue.
+

[PATCH v3 net-next 7/7] fou: Support flow dissection

2016-10-18 Thread Tom Herbert

This patch performs flow dissection for GUE and FOU. This is an
optional feature on the receiver and is set by FOU_ATTR_DEEP_HASH
netlink configuration. When enabled, the UDP socket flow_dissect
function is set to fou_flow_dissect or gue_flow_dissect as
appropriate. These functions return FLOW_DIS_RET_IPPROTO and
set the ip protocol argument. In the case of GUE the header is
parsed to find the protocol number.

Signed-off-by: Tom Herbert 
---
 include/uapi/linux/fou.h |  1 +
 net/ipv4/fou.c   | 68 +++-
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h
index d2947c5..2c837eb 100644
--- a/include/uapi/linux/fou.h
+++ b/include/uapi/linux/fou.h
@@ -15,6 +15,7 @@ enum {
FOU_ATTR_IPPROTO,   /* u8 */
FOU_ATTR_TYPE,  /* u8 */
FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */
+   FOU_ATTR_DEEP_HASH, /* flag */
 
__FOU_ATTR_MAX,
 };
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index cf50f7e..95ac5a8 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -27,7 +27,8 @@ struct fou {
struct rcu_head rcu;
 };
 
-#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+#define FOU_F_DEEP_HASH BIT(1)
 
 struct fou_cfg {
u16 type;
@@ -281,6 +282,16 @@ static int fou_gro_complete(struct sock *sk, struct 
sk_buff *skb,
return err;
 }
 
+static int fou_flow_dissect(struct sock *sk, const struct sk_buff *skb,
+   void *data, int hlen, int *nhoff, u8 *ip_proto,
+   __be16 *proto)
+{
+   *ip_proto = fou_from_sock(sk)->protocol;
+   *nhoff += sizeof(struct udphdr);
+
+   return FLOW_DIS_RET_IPPROTO;
+}
+
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
  struct guehdr *guehdr, void *data,
  size_t hdrlen, struct gro_remcsum *grc,
@@ -498,6 +509,48 @@ static int gue_gro_complete(struct sock *sk, struct 
sk_buff *skb, int nhoff)
return err;
 }
 
+static int gue_flow_dissect(struct sock *sk, const struct sk_buff *skb,
+   void *data, int hlen, int *nhoff, u8 *ip_proto,
+   __be16 *proto)
+{
+   struct guehdr _hdr, *hdr;
+
+   hdr = __skb_header_pointer(skb, *nhoff + sizeof(struct udphdr),
+  sizeof(_hdr), data, hlen, &_hdr);
+   if (!hdr)
+   return FLOW_DIS_RET_BAD;
+
+   switch (hdr->version) {
+   case 0: /* Full GUE header present */
+   if (hdr->control)
+   return FLOW_DIS_RET_PASS;
+
+   *nhoff += sizeof(struct udphdr) + sizeof(_hdr) +
+ (hdr->hlen << 2);
+   *ip_proto = hdr->proto_ctype;
+
+   return FLOW_DIS_RET_IPPROTO;
+   case 1:
+   /* Direct encasulation of IPv4 or IPv6 */
+
+   switch (((struct iphdr *)hdr)->version) {
+   case 4:
+   *nhoff += sizeof(struct udphdr);
+   *ip_proto = IPPROTO_IPIP;
+   return FLOW_DIS_RET_IPPROTO;
+   case 6:
+   *nhoff += sizeof(struct udphdr);
+   *ip_proto = IPPROTO_IPV6;
+   return FLOW_DIS_RET_IPPROTO;
+   default:
+   return FLOW_DIS_RET_PASS;
+   }
+
+   default:
+   return FLOW_DIS_RET_PASS;
+   }
+}
+
 static int fou_add_to_port_list(struct net *net, struct fou *fou)
 {
struct fou_net *fn = net_generic(net, fou_net_id);
@@ -568,12 +621,16 @@ static int fou_create(struct net *net, struct fou_cfg 
*cfg,
tunnel_cfg.encap_rcv = fou_udp_recv;
tunnel_cfg.gro_receive = fou_gro_receive;
tunnel_cfg.gro_complete = fou_gro_complete;
+   if (cfg->flags & FOU_F_DEEP_HASH)
+   tunnel_cfg.flow_dissect = fou_flow_dissect;
fou->protocol = cfg->protocol;
break;
case FOU_ENCAP_GUE:
tunnel_cfg.encap_rcv = gue_udp_recv;
tunnel_cfg.gro_receive = gue_gro_receive;
tunnel_cfg.gro_complete = gue_gro_complete;
+   if (cfg->flags & FOU_F_DEEP_HASH)
+   tunnel_cfg.flow_dissect = gue_flow_dissect;
break;
default:
err = -EINVAL;
@@ -637,6 +694,7 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 
1] = {
[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+   [FOU_ATTR_DEEP_HASH] = { .type = NLA_FLAG },
 };
 
 static int parse_nl_config(struct

[PATCH v3 net-next 6/7] udp: UDP tunnel flow dissection infrastructure

2016-10-18 Thread Tom Herbert

Add infrastructure to allow UDP tunnels to set up flow dissection.

Signed-off-by: Tom Herbert 
---
 include/net/udp_tunnel.h | 5 +
 net/ipv4/udp_tunnel.c| 5 +
 2 files changed, 10 insertions(+)

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 02c5be0..81d2584 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -69,6 +69,10 @@ typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct 
sock *sk,
 struct sk_buff *skb);
 typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
 int nhoff);
+typedef int (*udp_tunnel_flow_dissect_t)(struct sock *sk,
+const struct sk_buff *skb,
+void *data, int hlen, int *nhoff,
+u8 *ip_proto, __be16 *proto);
 
 struct udp_tunnel_sock_cfg {
void *sk_user_data; /* user data used by encap_rcv call back */
@@ -78,6 +82,7 @@ struct udp_tunnel_sock_cfg {
udp_tunnel_encap_destroy_t encap_destroy;
udp_tunnel_gro_receive_t gro_receive;
udp_tunnel_gro_complete_t gro_complete;
+   udp_tunnel_flow_dissect_t flow_dissect;
 };
 
 /* Setup the given (UDP) sock to receive UDP encapsulated packets */
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 58bd39f..4459288 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -72,6 +72,11 @@ void setup_udp_tunnel_sock(struct net *net, struct socket 
*sock,
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
 
+   if (cfg->flow_dissect) {
+   udp_sk(sk)->flow_dissect = cfg->flow_dissect;
+   udp_flow_dissect_enable();
+   }
+
udp_tunnel_encap_enable(sock);
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
-- 
2.9.3

[PATCH v3 net-next 5/7] udp: Add UDP flow dissection functions to IPv4 and IPv6

2016-10-18 Thread Tom Herbert

Add per protocol offload callbacks for flow_dissect to UDP for
IPv4 and IPv6. The callback functions extract the port number
information and with the packet addresses (given in an argument with
type flow_dissector_key_addrs) it performs a lookup on the UDP
socket. If a socket is found and flow_dissect is set for the
socket then that function is called.

Signed-off-by: Tom Herbert 
---
 net/ipv4/udp_offload.c | 39 +++
 net/ipv6/udp_offload.c | 40 +++-
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f9333c9..c7753ba 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -377,11 +377,50 @@ static int udp4_gro_complete(struct sk_buff *skb, int 
nhoff)
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
+/* Assumes rcu lock is held */
+static int udp4_flow_dissect(const struct sk_buff *skb, void *data, int hlen,
+int *nhoff, u8 *ip_proto, __be16 *proto,
+struct flow_dissector_key_addrs *key_addrs)
+{
+   u16 _ports[2], *ports;
+   struct net *net;
+   struct sock *sk;
+   int dif = -1;
+
+   /* See if there is a flow dissector in the UDP socket */
+
+   if (skb->dev) {
+   net = dev_net(skb->dev);
+   dif = skb->dev->ifindex;
+   } else if (skb->sk) {
+   net = sock_net(skb->sk);
+   } else {
+   return FLOW_DIS_RET_PASS;
+   }
+
+   ports = __skb_header_pointer(skb, *nhoff, sizeof(_ports),
+data, hlen, &_ports);
+   if (!ports)
+   return FLOW_DIS_RET_BAD;
+
+   sk = udp4_lib_lookup_noref(net,
+  key_addrs->v4addrs.src, ports[0],
+  key_addrs->v4addrs.dst, ports[1],
+  dif);
+
+   if (sk && udp_sk(sk)->flow_dissect)
+   return udp_sk(sk)->flow_dissect(sk, skb, data, hlen, nhoff,
+   ip_proto, proto);
+   else
+   return FLOW_DIS_RET_PASS;
+}
+
 static const struct net_offload udpv4_offload = {
.callbacks = {
.gso_segment = udp4_ufo_fragment,
.gro_receive  = udp4_gro_receive,
.gro_complete = udp4_gro_complete,
+   .flow_dissect = udp4_flow_dissect,
},
 };
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index ac858c4..12d9a92 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -1,5 +1,5 @@
 /*
- * IPV6 GSO/GRO offload support
+ * ipv6 gso/gro offload support
  * Linux INET6 implementation
  *
  * This program is free software; you can redistribute it and/or
@@ -163,11 +163,49 @@ static int udp6_gro_complete(struct sk_buff *skb, int 
nhoff)
return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
+/* Assumes rcu lock is held */
+static int udp6_flow_dissect(const struct sk_buff *skb, void *data, int hlen,
+int *nhoff, u8 *ip_proto, __be16 *proto,
+const struct flow_dissector_key_addrs *key_addrs)
+{
+   u16 _ports[2], *ports;
+   struct net *net;
+   struct sock *sk;
+   int dif = -1;
+
+   /* See if there is a flow dissector in the UDP socket */
+
+   if (skb->dev) {
+   net = dev_net(skb->dev);
+   dif = skb->dev->ifindex;
+   } else if (skb->sk) {
+   net = sock_net(skb->sk);
+   } else {
+   return FLOW_DIS_RET_PASS;
+   }
+
+   ports = __skb_header_pointer(skb, *nhoff, sizeof(_ports),
+data, hlen, &_ports);
+   if (!ports)
+   return FLOW_DIS_RET_BAD;
+
+   sk = udp6_lib_lookup_noref(net,
+  &key_addrs->v6addrs.src, ports[0],
+  &key_addrs->v6addrs.dst, ports[1],
+  dif);
+
+   if (sk && udp_sk(sk)->flow_dissect)
+   return udp_sk(sk)->flow_dissect(sk, skb, data, hlen, nhoff,
+   ip_proto, proto);
+   return FLOW_DIS_RET_PASS;
+}
+
 static const struct net_offload udpv6_offload = {
.callbacks = {
.gso_segment=   udp6_ufo_fragment,
.gro_receive=   udp6_gro_receive,
.gro_complete   =   udp6_gro_complete,
+   .flow_dissect   =   udp6_flow_dissect,
},
 };
 
-- 
2.9.3

[PATCH v3 net-next 2/7] flow_dissector: Limit processing of next encaps and extensions

2016-10-18 Thread Tom Herbert

Flow dissector does not limit the number of encapsulated packets or IPv6
header extensions that will be processed. This could easily be
susceptible to a DoS attack -- for instance, a 1500 byte packet could contain
75 IPIP headers.

This patch places limits on the number of encapsulations and IPv6 extension
headers that are processed in the flow dissector.

Signed-off-by: Tom Herbert 
---
 net/core/flow_dissector.c | 37 +++--
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1a7b80f..919bd02 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -91,6 +91,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int 
thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+#define MAX_DISSECT_DEPTH  10
+#define MAX_DISSECT_EXT 10
+
+#define __DISSECT_AGAIN(_target, _depth, _limit) do {  \
+   (_depth)++; \
+   if ((_depth) > (_limit))\
+   goto out_good;  \
+   else\
+   goto _target;   \
+} while (0)
+
+#define DISSECT_AGAIN(target) \
+   __DISSECT_AGAIN(target, depth, MAX_DISSECT_DEPTH)
+#define DISSECT_AGAIN_EXT(target) \
+   __DISSECT_AGAIN(target, ext_cnt, MAX_DISSECT_EXT)
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are 
specified
@@ -123,6 +139,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
bool skip_vlan = false;
u8 ip_proto = 0;
bool ret = false;
+   int depth = 0, ext_cnt = 0;
 
if (!data) {
data = skb->data;
@@ -262,7 +279,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
if (skip_vlan)
-   goto again;
+   DISSECT_AGAIN(again);
}
 
skip_vlan = true;
@@ -285,7 +302,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
}
}
 
-   goto again;
+   DISSECT_AGAIN(again);
}
case htons(ETH_P_PPP_SES): {
struct {
@@ -299,9 +316,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
nhoff += PPPOE_SES_HLEN;
switch (proto) {
case htons(PPP_IP):
-   goto ip;
+   DISSECT_AGAIN(ip);
case htons(PPP_IPV6):
-   goto ipv6;
+   DISSECT_AGAIN(ipv6);
default:
goto out_bad;
}
@@ -472,7 +489,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
 
-   goto again;
+   DISSECT_AGAIN(again);
}
case NEXTHDR_HOP:
case NEXTHDR_ROUTING:
@@ -490,7 +507,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
ip_proto = opthdr[0];
nhoff += (opthdr[1] + 1) << 3;
 
-   goto ip_proto_again;
+   DISSECT_AGAIN_EXT(ip_proto_again);
}
case NEXTHDR_FRAGMENT: {
struct frag_hdr _fh, *fh;
@@ -512,7 +529,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
if (!(fh->frag_off & htons(IP6_OFFSET))) {
key_control->flags |= FLOW_DIS_FIRST_FRAG;
if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
-   goto ip_proto_again;
+   DISSECT_AGAIN_EXT(ip_proto_again);
}
goto out_good;
}
@@ -523,7 +540,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
 
-   goto ip;
+   DISSECT_AGAIN(ip);
case IPPROTO_IPV6:
proto = htons(ETH_P_IPV6);
 
@@ -531,10 +548,10 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
 
-   goto ipv6;
+   DISSECT_AGAIN(ipv6);
case IPPROTO_MPLS:
proto = htons(ETH_P_MPLS_UC);
-   goto mpls;
+   DISSECT_AGAIN(mpls);
default:
break;
}
-- 
2.9.3

[PATCH v3 net-next 1/7] ipv6: Fix Makefile conditional to use CONFIG_INET

2016-10-18 Thread Tom Herbert

ipv6 directory was being built based on CONFIG_NET not CONFIG_INET.

Signed-off-by: Tom Herbert 
---
 drivers/net/usb/cdc_mbim.c  |  4 
 include/net/ipv6.h  | 15 +++
 include/net/net_namespace.h |  2 ++
 net/Makefile|  2 +-
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index 96a5028..6b38e0c 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -301,6 +301,7 @@ static struct sk_buff *cdc_mbim_tx_fixup(struct usbnet 
*dev, struct sk_buff *skb
return NULL;
 }
 
+#ifdef CONFIG_INET
 /* Some devices are known to send Neigbor Solicitation messages and
  * require Neigbor Advertisement replies.  The IPv6 core will not
  * respond since IFF_NOARP is set, so we must handle them ourselves.
@@ -350,6 +351,7 @@ static void do_neigh_solicit(struct usbnet *dev, u8 *buf, 
u16 tci)
 out:
dev_put(netdev);
 }
+#endif
 
 static bool is_neigh_solicit(u8 *buf, size_t len)
 {
@@ -377,8 +379,10 @@ static struct sk_buff *cdc_mbim_process_dgram(struct 
usbnet *dev, u8 *buf, size_
proto = htons(ETH_P_IP);
break;
case 0x60:
+#ifdef CONFIG_INET
if (is_neigh_solicit(buf, len))
do_neigh_solicit(dev, buf, tci);
+#endif
proto = htons(ETH_P_IPV6);
break;
default:
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8fed1cd..cbb1ce0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -936,8 +936,15 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct 
ipv6_txoptions *opt,
 void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
 u8 *proto);
 
+#ifdef CONFIG_INET
 int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
 __be16 *frag_offp);
+#else
+static inline int ipv6_skip_exthdr(const struct sk_buff *skb, int start,
+  u8 *nexthdrp, __be16 *frag_offp) {
+   return -1;
+}
+#endif
 
 bool ipv6_ext_hdr(u8 nexthdr);
 
@@ -948,8 +955,16 @@ enum {
 };
 
 /* find specified header and get offset to it */
+#ifdef CONFIG_INET
 int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
  unsigned short *fragoff, int *fragflg);
+#else
+static inline int ipv6_find_hdr(const struct sk_buff *skb, unsigned int 
*offset,
+   int target, unsigned short *fragoff,
+   int *fragflg) {
+   return -EPROTONOSUPPORT;
+}
+#endif
 
 int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fc4f757..b4c4a5f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -352,8 +352,10 @@ static inline void rt_genid_bump_ipv4(struct net *net)
 extern void (*__fib6_flush_trees)(struct net *net);
 static inline void rt_genid_bump_ipv6(struct net *net)
 {
+#ifdef CONFIG_INET
if (__fib6_flush_trees)
__fib6_flush_trees(net);
+#endif
 }
 
 #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2..82ffb91 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER)   += netfilter/
 obj-$(CONFIG_INET) += ipv4/
 obj-$(CONFIG_XFRM) += xfrm/
 obj-$(CONFIG_UNIX) += unix/
-obj-$(CONFIG_NET)  += ipv6/
+obj-$(CONFIG_INET) += ipv6/
 obj-$(CONFIG_PACKET)   += packet/
 obj-$(CONFIG_NET_KEY)  += key/
 obj-$(CONFIG_BRIDGE)   += bridge/
-- 
2.9.3

[PATCH v3 net-next 4/7] udp: UDP flow dissector

2016-10-18 Thread Tom Herbert

Add infrastructure for performing per protocol flow dissection and
support flow dissection in UDP payloads (e.g. flow dissection on a
UDP encapsulated tunnel).

The per protocol flow dissector is called by flow_dissect function
in the offload_callbacks of a protocol. The arguments of this function
include the necessary information to do flow dissection as derived
from __skb_flow_dissect which is where the callback is intended to be
called from. There are return codes from the callback in the form
FLOW_DIS_RET_* that indicate the result. FLOW_DIS_RET_IPPROTO
means that the payload should be dissected as an IP proto, the
specific protocol is returned in a pointer argument. Likewise,
FLOW_DIS_RET_PROTO indicates that the payload should be processed as
an ethertype which is returned in another argument.

A case for IPPROTO_UDP was added to __skb_flow_dissect. Since
UDP flow dissector involves a relatively expensive socket lookup
there is a static key check first to see if there are any sockets
that have enabled flow dissection. After this check, the offload
ops for UDP for either IPv4 or IPv6 is considered. If the
flow_dissect function is set, it is called. Upon return the result
is processed (pass, out_bad, process as IP protocol, process
as ethertype). Note that if the result indicates a protocol must
be processed it is expected that nhoff has been updated to the
encapsulated protocol header.

Signed-off-by: Tom Herbert 
---
 include/linux/netdevice.h|  5 +++
 include/linux/udp.h  |  7 
 include/net/flow_dissector.h |  8 +
 include/net/udp.h|  4 +++
 net/core/flow_dissector.c| 85 ++--
 net/ipv4/udp.c   |  3 ++
 6 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf341b6..c5f4295 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2203,6 +2203,11 @@ struct offload_callbacks {
struct sk_buff  **(*gro_receive)(struct sk_buff **head,
 struct sk_buff *skb);
int (*gro_complete)(struct sk_buff *skb, int nhoff);
+   int (*flow_dissect)(const struct sk_buff *skb,
+   void *data, int hlen,
+   int *nhoff, u8 *ip_proto,
+   __be16 *proto,
+struct flow_dissector_key_addrs *key_addrs);
 };
 
 struct packet_offload {
diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd..608ebf4 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,13 @@ struct udp_sock {
int (*gro_complete)(struct sock *sk,
struct sk_buff *skb,
int nhoff);
+
+   /* Flow dissector function for UDP socket */
+   int (*flow_dissect)(struct sock *sk,
+   const struct sk_buff *skb,
+   void *data, int hlen,
+   int *nhoff, u8 *ip_proto,
+   __be16 *proto);
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d953492..9de4904 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -203,4 +203,12 @@ static inline void *skb_flow_dissector_target(struct 
flow_dissector *flow_dissec
return ((char *)target_container) + flow_dissector->offset[key_id];
 }
 
+/* Return codes from per socket flow dissector (e.g. UDP) */
+enum {
+   FLOW_DIS_RET_PASS = 0,
+   FLOW_DIS_RET_BAD,
+   FLOW_DIS_RET_IPPROTO,
+   FLOW_DIS_RET_PROTO,
+};
+
 #endif
diff --git a/include/net/udp.h b/include/net/udp.h
index 717a972..8d364e8 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -360,4 +360,8 @@ void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 void udpv6_encap_enable(void);
 #endif
+
+void udp_flow_dissect_enable(void);
+void udp_flow_dissect_disable(void);
+
 #endif /* _UDP_H */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 919bd02..06ccfd5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -8,6 +8,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -57,6 +59,20 @@ void skb_flow_dissector_init(struct flow_dissector 
*flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
+static struct static_key udp_flow_dissect __read_mostly;
+
+void udp_flow_dissect_enable(void)
+{
+   static_key_slow_inc(&udp_flow_dissect);
+}
+EXPORT_SYMBOL(udp_flow_dissect_enable);
+
+void udp_flow_dissect_disable(void)
+{
+   static_key_slow_dec(&udp_flow_dissect);
+}
+EXPORT_SYMBOL(udp_flow_dissect_disable);
+
 /**
  * __skb_flow_get_ports -

[PATCH v3 net-next 0/7] udp: Flow dissection for tunnels

2016-10-18 Thread Tom Herbert

Now that we have a means to perform a UDP socket lookup without taking
a reference, it is feasible to have flow dissector crack open UDP
encapsulated packets. Generally, we would expect that the UDP source
port or the flow label in IPv6 would contain enough entropy about
the encapsulated flow. However, there will be cases, such as a static
UDP tunnel with fixed ports, where dissecting the encapsulated packet
is valuable.

The model here is similar to that implemented for UDP GRO. A
tunnel implementation (e.g. GUE) may set a flow_dissect function
in the udp_sk. In __skb_flow_dissect a case has been added for
UDP to check if there is a socket with flow_dissect set. If there
is, the function is called. The (per tunnel implementation)
function can parse the encapsulation headers and return the
next protocol for __skb_flow_dissect to process and its position
in nhoff.

Since performing a UDP lookup on every packet might be expensive
I added a static key check to bypass the lookup if there are no
sockets with flow_dissect set. I should mention that doing the
lookup wasn't particularly a big hit anyway.

Fou/gue was modified to perform tunnel dissection. This is enabled
on each listener socket via a netlink configuration option.

v2:
  - davem suggested that we don't need udp_flow_dissect and that
udp{v6}_encap_needed could be used. Problem is that those are
in respective udp.c and flow_dissector.c is in net/core. Keep
udp_flow_dissect as more generic item.
  - Fixed Makefile issue where we were using CONFIG_NET instead of
CONFIG_INET.
  - Added limits in flow dissector for controlling the number of nested
encapsulations or EHs that are dissected.
  - Added CONFIG_INET around use of inet_offloads in flow_dissector.c.

v3:
  - Fix build issues with modules that call IPv6 functions and
CONFIG_INET is not set.
  - Fix compilation error in init'ing .flow_dissect in IPv6 UDP
offload.

Tested:

Running 200 streams with TCP_RR.

GRE/GUE variable source port (baseline)
RSS distributes packets, RFS is effective
1211702 tps
147/241/442 50/90/99% latencies
87.95 CPU utilization

GRE/GUE fixed source port
All packets to one CPU, RFS is ineffective
173680 tps
1170/1377/1853 50/90/99% latencies
7.42 CPU utilization

GRE/GUE fixed source port with deep hash enabled
All packets to one CPU, but now RFS is effective
730359 tps
263/325/464 50/90/99% latencies
38.25% CPU utilization (Interrupting CPU is maxed out)


Tom Herbert (7):
  ipv6: Fix Makefile conditional to use CONFIG_INET
  flow_dissector: Limit processing of next encaps and extensions
  udp: Add socket lookup functions with noref
  udp: UDP flow dissector
  udp: Add UDP flow dissection functions to IPv4 and IPv6
  udp: UDP tunnel flow dissection infrastructure
  fou: Support flow dissection

 drivers/net/usb/cdc_mbim.c   |   4 ++
 include/linux/netdevice.h|   5 ++
 include/linux/udp.h  |   7 +++
 include/net/flow_dissector.h |   8 +++
 include/net/ipv6.h   |  15 ++
 include/net/net_namespace.h  |   2 +
 include/net/udp.h|  12 +
 include/net/udp_tunnel.h |   5 ++
 include/uapi/linux/fou.h |   1 +
 net/Makefile |   2 +-
 net/core/flow_dissector.c| 122 ++-
 net/ipv4/fou.c   |  68 +++-
 net/ipv4/udp.c   |  11 
 net/ipv4/udp_offload.c   |  39 ++
 net/ipv4/udp_tunnel.c|   5 ++
 net/ipv6/udp.c   |  10 
 net/ipv6/udp_offload.c   |  40 +-
 17 files changed, 341 insertions(+), 15 deletions(-)

-- 
2.9.3

[PATCH v3 net-next 3/7] udp: Add socket lookup functions with noref

2016-10-18 Thread Tom Herbert

Create udp4_lib_lookup_noref and udp6_lib_lookup_noref. These perform
a socket lookup on addresses and ports without taking a reference.

Signed-off-by: Tom Herbert 
---
 include/net/udp.h |  8 
 net/ipv4/udp.c|  8 
 net/ipv6/udp.c| 10 ++
 3 files changed, 26 insertions(+)

diff --git a/include/net/udp.h b/include/net/udp.h
index ea53a87..717a972 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -275,6 +275,10 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 
saddr, __be16 sport,
   struct udp_table *tbl, struct sk_buff *skb);
 struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
 __be16 sport, __be16 dport);
+struct sock *udp4_lib_lookup_noref(struct net *net,
+  __be32 saddr, __be16 sport,
+  __be32 daddr, __be16 dport,
+  int dif);
 struct sock *udp6_lib_lookup(struct net *net,
 const struct in6_addr *saddr, __be16 sport,
 const struct in6_addr *daddr, __be16 dport,
@@ -286,6 +290,10 @@ struct sock *__udp6_lib_lookup(struct net *net,
   struct sk_buff *skb);
 struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
 __be16 sport, __be16 dport);
+struct sock *udp6_lib_lookup_noref(struct net *net,
+  const struct in6_addr *saddr, __be16 sport,
+  const struct in6_addr *daddr, __be16 dport,
+  int dif);
 
 /*
  * SNMP statistics for UDP and UDP-Lite
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d96dc2..7f84c51 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -595,6 +595,14 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 
saddr, __be16 sport,
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 #endif
 
+struct sock *udp4_lib_lookup_noref(struct net *net, __be32 saddr, __be16 sport,
+  __be32 daddr, __be16 dport, int dif)
+{
+   return __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+dif, &udp_table, NULL);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup_noref);
+
 static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, __be32 loc_addr,
   __be16 rmt_port, __be32 rmt_addr,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9aa7c1c..6e382d9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -317,6 +317,16 @@ struct sock *udp6_lib_lookup(struct net *net, const struct 
in6_addr *saddr, __be
 EXPORT_SYMBOL_GPL(udp6_lib_lookup);
 #endif
 
+struct sock *udp6_lib_lookup_noref(struct net *net,
+  const struct in6_addr *saddr, __be16 sport,
+  const struct in6_addr *daddr, __be16 dport,
+  int dif)
+{
+   return __udp6_lib_lookup(net, saddr, sport, daddr, dport,
+dif, &udp_table, NULL);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup_noref);
+
 /*
  * This should be easy, if there is something there we
  * return it, otherwise we block.
-- 
2.9.3

[patch net] rtnetlink: Add rtnexthop offload flag to compare mask

2016-10-18 Thread Jiri Pirko

From: Jiri Pirko 

The offload flag is a status flag and should not be used by
FIB semantics for comparison.

Fixes: 37ed9493699c ("rtnetlink: add RTNH_F_EXTERNAL flag for fib offload")
Signed-off-by: Jiri Pirko 
---
Please queue-up to stable as well. Thanks.
---
 include/uapi/linux/rtnetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 262f037..5a78be5 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -350,7 +350,7 @@ struct rtnexthop {
 #define RTNH_F_OFFLOAD 8   /* offloaded route */
 #define RTNH_F_LINKDOWN16  /* carrier-down on nexthop */
 
-#define RTNH_COMPARE_MASK  (RTNH_F_DEAD | RTNH_F_LINKDOWN)
+#define RTNH_COMPARE_MASK  (RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD)
 
 /* Macros to handle hexthops */
 
-- 
2.5.5

Re: [patch net 1/6] switchdev: Execute bridge ndos only for bridge ports

2016-10-18 Thread Jiri Pirko

That's it. Only one patch :)

[patch net 1/6] switchdev: Execute bridge ndos only for bridge ports

2016-10-18 Thread Jiri Pirko

From: Ido Schimmel 

We recently got the following warning after setting up a vlan device on
top of an offloaded bridge and executing 'bridge link':

WARNING: CPU: 0 PID: 18566 at 
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:81 
mlxsw_sp_port_orig_get.part.9+0x55/0x70 [mlxsw_spectrum]
[...]
 CPU: 0 PID: 18566 Comm: bridge Not tainted 4.8.0-rc7 #1
 Hardware name: Mellanox Technologies Ltd. Mellanox switch/Mellanox switch, 
BIOS 4.6.5 05/21/2015
  0286 e64ab94f 880406e6f8f0 8135eaa3
    880406e6f930 8108c43b
  005106e6f988 8803df398840 880403c60108 880406e6f990
 Call Trace:
  [] dump_stack+0x63/0x90
  [] __warn+0xcb/0xf0
  [] warn_slowpath_null+0x1d/0x20
  [] mlxsw_sp_port_orig_get.part.9+0x55/0x70 [mlxsw_spectrum]
  [] mlxsw_sp_port_attr_get+0xa5/0xb0 [mlxsw_spectrum]
  [] switchdev_port_attr_get+0x4f/0x140
  [] switchdev_port_attr_get+0x100/0x140
  [] switchdev_port_attr_get+0x100/0x140
  [] switchdev_port_bridge_getlink+0x5b/0xc0
  [] ? switchdev_port_fdb_dump+0x90/0x90
  [] rtnl_bridge_getlink+0xe7/0x190
  [] netlink_dump+0x122/0x290
  [] __netlink_dump_start+0x15f/0x190
  [] ? rtnl_bridge_dellink+0x230/0x230
  [] rtnetlink_rcv_msg+0x1a6/0x220
  [] ? __kmalloc_node_track_caller+0x208/0x2c0
  [] ? rtnl_bridge_dellink+0x230/0x230
  [] ? rtnl_newlink+0x890/0x890
  [] netlink_rcv_skb+0xa4/0xc0
  [] rtnetlink_rcv+0x28/0x30
  [] netlink_unicast+0x18c/0x240
  [] netlink_sendmsg+0x2fb/0x3a0
  [] sock_sendmsg+0x38/0x50
  [] SYSC_sendto+0x101/0x190
  [] ? __sys_recvmsg+0x51/0x90
  [] SyS_sendto+0xe/0x10
  [] entry_SYSCALL_64_fastpath+0x1a/0xa4

The problem is that the 8021q module propagates the call to
ndo_bridge_getlink() via switchdev ops, but the switch driver doesn't
recognize the netdev, as it's not offloaded.

While we can ignore calls being made to non-bridge ports inside the
driver, a better fix would be to push this check up to the switchdev
layer.

Note that these ndos can be called for non-bridged netdev, but this only
happens in certain PF drivers which don't call the corresponding
switchdev functions anyway.

Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge 
devices")
Signed-off-by: Ido Schimmel 
Reported-by: Tamir Winetroub 
Tested-by: Tamir Winetroub 
Signed-off-by: Jiri Pirko 
---
Please queue-up to stable as well. Thanks.
---
 net/switchdev/switchdev.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 02beb35..3b95fe9 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -771,6 +771,9 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 
pid, u32 seq,
u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD;
int err;
 
+   if (!netif_is_bridge_port(dev))
+   return -EOPNOTSUPP;
+
err = switchdev_port_attr_get(dev, &attr);
if (err && err != -EOPNOTSUPP)
return err;
@@ -926,6 +929,9 @@ int switchdev_port_bridge_setlink(struct net_device *dev,
struct nlattr *afspec;
int err = 0;
 
+   if (!netif_is_bridge_port(dev))
+   return -EOPNOTSUPP;
+
protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
   IFLA_PROTINFO);
if (protinfo) {
@@ -959,6 +965,9 @@ int switchdev_port_bridge_dellink(struct net_device *dev,
 {
struct nlattr *afspec;
 
+   if (!netif_is_bridge_port(dev))
+   return -EOPNOTSUPP;
+
afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
 IFLA_AF_SPEC);
if (afspec)
-- 
2.5.5

[PATCH net-next] ethernet/sfc: use core min/max MTU checking

2016-10-18 Thread Bert Kenward

Fixes: 61e84623 ("net: centralize net_device min/max MTU checking")
Signed-off-by: Bert Kenward 
---
 drivers/net/ethernet/sfc/efx.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 3cf3557..b626da6 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2263,18 +2263,6 @@ static int efx_change_mtu(struct net_device *net_dev, 
int new_mtu)
rc = efx_check_disabled(efx);
if (rc)
return rc;
-   if (new_mtu > EFX_MAX_MTU) {
-   netif_err(efx, drv, efx->net_dev,
- "Requested MTU of %d too big (max: %d)\n",
- new_mtu, EFX_MAX_MTU);
-   return -EINVAL;
-   }
-   if (new_mtu < EFX_MIN_MTU) {
-   netif_err(efx, drv, efx->net_dev,
- "Requested MTU of %d too small (min: %d)\n",
- new_mtu, EFX_MIN_MTU);
-   return -EINVAL;
-   }
 
netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu);
 
@@ -2478,6 +2466,8 @@ static int efx_register_netdev(struct efx_nic *efx)
net_dev->priv_flags |= IFF_UNICAST_FLT;
net_dev->ethtool_ops = &efx_ethtool_ops;
net_dev->gso_max_segs = EFX_TSO_MAX_SEGS;
+   net_dev->min_mtu = EFX_MIN_MTU;
+   net_dev->max_mtu = EFX_MAX_MTU;
 
rtnl_lock();
 
-- 
2.7.4

Re: [PATCH v3 4/4] net: smsc91x: add u16 workaround for pxa platforms

2016-10-18 Thread Rob Herring

On Mon, Oct 17, 2016 at 09:45:32PM +0200, Robert Jarzmik wrote:
> Add a workaround for mainstone, idp and stargate2 boards, for u16 writes
> which must be aligned on 32 bits addresses.
> 
> Signed-off-by: Robert Jarzmik 
> Cc: Jeremy Linton 
> ---
> Since v1: rename dt property to pxa-u16-align4
> change the binding documentation file
> ---
>  Documentation/devicetree/bindings/net/smsc-lan91c111.txt | 2 ++
>  1 file changed, 2 insertions(+)

Acked-by: Rob Herring

[PATCH net v2] bridge: multicast: restore perm router ports on multicast enable

2016-10-18 Thread Nikolay Aleksandrov

Satish reported a problem with the perm multicast router ports not getting
reenabled after some series of events, in particular if it happens that the
multicast snooping has been disabled and the port goes to disabled state
then it will be deleted from the router port list, but if it moves into
non-disabled state it will not be re-added because the mcast snooping is
still disabled, and enabling snooping later does nothing.

Here are the steps to reproduce, setup br0 with snooping enabled and eth1
added as a perm router (multicast_router = 2):
1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping
2. $ ip l set eth1 down
^ This step deletes the interface from the router list
3. $ ip l set eth1 up
^ This step does not add it again because mcast snooping is disabled
4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping
5. $ bridge -d -s mdb show


At this point we have mcast enabled and eth1 as a perm router (value = 2)
but it is not in the router list which is incorrect.

After this change:
1. $ echo 0 > /sys/class/net/br0/bridge/multicast_snooping
2. $ ip l set eth1 down
^ This step deletes the interface from the router list
3. $ ip l set eth1 up
^ This step does not add it again because mcast snooping is disabled
4. $ echo 1 > /sys/class/net/br0/bridge/multicast_snooping
5. $ bridge -d -s mdb show
router ports on br0: eth1

Note: we can directly do br_multicast_enable_port for all because the
querier timer already has checks for the port state and will simply
expire if it's in blocking/disabled. See the comment added by
commit 9aa66382163e7 ("bridge: multicast: add a comment to
br_port_state_selection about blocking state")

Fixes: 561f1103a2b7 ("bridge: Add multicast_snooping sysfs toggle")
Reported-by: Satish Ashok 
Signed-off-by: Nikolay Aleksandrov 
---
v2: just call br_multicast_enable_port for all ports and use
br_multicast_open

 net/bridge/br_multicast.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index c5fea9393946..2136e45f5277 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -972,13 +972,12 @@ static void br_multicast_enable(struct 
bridge_mcast_own_query *query)
mod_timer(>timer, jiffies);
 }
 
-void br_multicast_enable_port(struct net_bridge_port *port)
+static void __br_multicast_enable_port(struct net_bridge_port *port)
 {
struct net_bridge *br = port->br;
 
-   spin_lock(>multicast_lock);
if (br->multicast_disabled || !netif_running(br->dev))
-   goto out;
+   return;
 
br_multicast_enable(>ip4_own_query);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -987,8 +986,14 @@ void br_multicast_enable_port(struct net_bridge_port *port)
if (port->multicast_router == MDB_RTR_TYPE_PERM &&
hlist_unhashed(>rlist))
br_multicast_add_router(br, port);
+}
 
-out:
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+   struct net_bridge *br = port->br;
+
+   spin_lock(&br->multicast_lock);
+   __br_multicast_enable_port(port);
spin_unlock(&br->multicast_lock);
 }
 
@@ -1994,8 +1999,9 @@ static void br_multicast_start_querier(struct net_bridge 
*br,
 
 int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 {
-   int err = 0;
struct net_bridge_mdb_htable *mdb;
+   struct net_bridge_port *port;
+   int err = 0;
 
spin_lock_bh(>multicast_lock);
if (br->multicast_disabled == !val)
@@ -2023,10 +2029,9 @@ int br_multicast_toggle(struct net_bridge *br, unsigned 
long val)
goto rollback;
}
 
-   br_multicast_start_querier(br, >ip4_own_query);
-#if IS_ENABLED(CONFIG_IPV6)
-   br_multicast_start_querier(br, >ip6_own_query);
-#endif
+   br_multicast_open(br);
+   list_for_each_entry(port, >port_list, list)
+   __br_multicast_enable_port(port);
 
 unlock:
spin_unlock_bh(>multicast_lock);
-- 
2.1.4

Re: [PATCH net v2 0/3] Tunneling fixes

2016-10-18 Thread Juerg Haefliger

> This series fixes a problem that was reported where encapsulated packets
> do not have their encapsulation offload markers stripped off when being
> decapsulated. This causes a significant performance drop if the packets
> are later retransmitted.
>
> Fixing this revealed two other bugs which are also addressed as prerequisites:
>  * GRO can aggregate packets for multiple layers of encapsulation which the
>stack cannot properly handle.
>  * IPIP packets which are combined by GRO are not marked properly with their
>GSO type.
>
> Note that this is based off the net-next tree as the current target for
> bug fixes.

I need to backport this series to the 4.4 kernel to fix a performance issue 
we're seeing. The series
applies but commit a09a4c8dd1ec (tunnels: Remove encapsulation offloads on 
decap) breaks compilation
when CONFIG_IPV6_SIT is enabled. This is because the patch uses 
iptunnel_pull_header() whose usage
changed with commit 7f290c94352e (iptunnel: scrub packet in 
iptunnel_pull_header) which is not in 4.4.

7f290c94352e seems to be a cleanup patch which also requires c9e78efb6f66 
(vxlan: move vxlan device
lookup before iptunnel_pull_header) and potentially others. Rather than pulling 
in a slew of cleanup
patches, I was wondering if the following from commit a09a4c8dd1ec can be 
rewritten without using
the 'new' iptunnel_pull_header() function:

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index f45b8ffc2840..83384308d032 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -681,14 +681,16 @@ static int ipip6_rcv(struct sk_buff *skb)
skb->mac_header = skb->network_header;
skb_reset_network_header(skb);
IPCB(skb)->flags = 0;
-   skb->protocol = htons(ETH_P_IPV6);
+   skb->dev = tunnel->dev;

if (packet_is_spoofed(skb, iph, tunnel)) {
tunnel->dev->stats.rx_errors++;
goto out;
}

-   __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+   if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6),
+   !net_eq(tunnel->net, dev_net(tunnel->dev))))
+   goto out;


Thanks
...Juerg


> v2: No code changes, just additional information in commit messages and
> a new cover letter.
>
> Jesse Gross (3):
>   ipip: Properly mark ipip GRO packets as encapsulated.
>   tunnels: Don't apply GRO to multiple layers of encapsulation.
>   tunnels: Remove encapsulation offloads on decap.
>
>  include/linux/netdevice.h |  4 ++--
>  include/net/ip_tunnels.h  | 16 
>  net/core/dev.c|  2 +-
>  net/ipv4/af_inet.c| 24 ++--
>  net/ipv4/fou.c| 13 +++--
>  net/ipv4/gre_offload.c|  5 +
>  net/ipv4/ip_tunnel_core.c |  3 ++-
>  net/ipv4/udp_offload.c|  6 +++---
>  net/ipv6/ip6_offload.c| 15 ++-
>  net/ipv6/sit.c|  6 --
>  10 files changed, 80 insertions(+), 14 deletions(-)



signature.asc
Description: OpenPGP digital signature

Re: [PATCH v3 net-next 00/11] net: Fix netdev adjacency tracking

2016-10-18 Thread David Ahern

On 10/18/16 9:46 AM, David Miller wrote:
> Series applied, but the recursion is disappointing.
> 
> If we run into problems due to kernel stack depth because of this with
> some configurations (reasonable or not, if we allow it then it can't
> crash the kernel), we will either need to find a way to make this walk
> iterative or revert these changes.

understood.

Since 4.9 is tagged as the next LTS I would like to see the series applied to 
it at some point - assuming no problems show up with wider exposure in net-next.

Re: [PATCH v5 0/4] Add support for led triggers on phy link state change

2016-10-18 Thread David Miller

From: Zach Brown 
Date: Mon, 17 Oct 2016 10:49:51 -0500

> Fix skge driver that declared enum constants that conflicted with enum
> constants in linux/leds.h
> 
> Create function that encapsulates actions taken during the adjust phy link 
> step
> of phy state changes.
> 
> Create function that provides list of speeds currently supported by the phy.
> 
> Add support for led triggers on phy link state changes by adding
> a config option. When set the config option will create a set of led triggers
> for each phy device. Users can use the led triggers to represent link state
> changes on the phy.

Ok this looks good enough for now, we can expand and improve upon it
later if necessary.

Series applied, thanks Zach.

Re: [PATCH -next] net: dsa: mv88e6xxx: fix non static symbol warning

2016-10-18 Thread Andrew Lunn

On Tue, Oct 18, 2016 at 03:53:37PM +, Wei Yongjun wrote:
> From: Wei Yongjun 
> 
> Fixes the following sparse warning:
> 
> drivers/net/dsa/mv88e6xxx/chip.c:2866:5: warning:
>  symbol 'mv88e6xxx_g1_set_switch_mac' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun 

Reviewed-by: Andrew Lunn 

Andrew

[PATCH -next] qed: Remove useless set memory to zero use memset()

2016-10-18 Thread Wei Yongjun

From: Wei Yongjun 

The memory returned by kzalloc() has already been set to zero, so
remove the useless memset(0).

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/qlogic/qed/qed_roce.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c 
b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index f3a825a..6a353ff 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -2658,7 +2658,6 @@ static int qed_roce_ll2_start(struct qed_dev *cdev,
DP_ERR(cdev, "qed roce ll2 start: failed memory allocation\n");
return -ENOMEM;
}
-   memset(roce_ll2, 0, sizeof(*roce_ll2));
roce_ll2->handle = QED_LL2_UNUSED_HANDLE;
roce_ll2->cbs = params->cbs;
roce_ll2->cb_cookie = params->cb_cookie;

[PATCH -next] net: dsa: mv88e6xxx: fix non static symbol warning

2016-10-18 Thread Wei Yongjun

From: Wei Yongjun 

Fixes the following sparse warning:

drivers/net/dsa/mv88e6xxx/chip.c:2866:5: warning:
 symbol 'mv88e6xxx_g1_set_switch_mac' was not declared. Should it be static?

Signed-off-by: Wei Yongjun 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ac03297..157360f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2863,7 +2863,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
return mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, 0x);
 }
 
-int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr)
+static int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr)
 {
int err;

Re: [PATCH v2 net-next 0/7] udp: Flow dissection for tunnels

2016-10-18 Thread David Miller

From: David Miller 
Date: Tue, 18 Oct 2016 11:48:37 -0400 (EDT)

> Series applied, thanks Tom.

Actually, reverted.

Tom, would you mind build testing with ipv6 enabled? :-)

net/ipv6/udp_offload.c:208:19: error: initialization from incompatible pointer 
type [-Werror=incompatible-pointer-types]
   .flow_dissect = udp6_flow_dissect,
   ^
net/ipv6/udp_offload.c:208:19: note: (near initialization for 
‘udpv6_offload.callbacks.flow_dissect’)

Re: [PATCH v2 net-next 0/7] udp: Flow dissection for tunnels

2016-10-18 Thread David Miller

From: Tom Herbert 
Date: Mon, 17 Oct 2016 12:41:55 -0700

> Now that we have a means to perform a UDP socket lookup without taking
> a reference, it is feasible to have flow dissector crack open UDP
> encapsulated packets. Generally, we would expect that the UDP source
> port or the flow label in IPv6 would contain enough entropy about
> the encapsulated flow. However, there will be cases, such as a static
> UDP tunnel with fixed ports, where dissecting the encapsulated packet
> is valuable.
> 
> The model here is similar to that implemented for UDP GRO. A
> tunnel implementation (e.g. GUE) may set a flow_dissect function
> in the udp_sk. In __skb_flow_dissect a case has been added for
> UDP to check if there is a socket with flow_dissect set. If there
> is, the function is called. The (per tunnel implementation)
> function can parse the encapsulation headers and return the
> next protocol for __skb_flow_dissect to process and its position
> in nhoff.
> 
> Since performing a UDP lookup on every packet might be expensive
> I added a static key check to bypass the lookup if there are no
> sockets with flow_dissect set. I should mention that doing the
> lookup wasn't particularly a big hit anyway.
> 
> Fou/gue was modified to perform tunnel dissection. This is enabled
> on each listener socket via a netlink configuration option.

Series applied, thanks Tom.

Re: [PATCH v3 net-next 00/11] net: Fix netdev adjacency tracking

2016-10-18 Thread David Miller

From: David Ahern 
Date: Mon, 17 Oct 2016 19:15:42 -0700

> The netdev adjacency tracking is failing to create proper dependencies
> for some topologies. For example this topology
 ...
> hits 1 of 2 problems depending on the order of enslavement. The base set of
> commands for both cases:
 ...
> Case 1 enslave macvlan to the vrf before enslaving the bond to the bridge:
 ...
> Attempts to delete the VRF:
> ip link delete myvrf
> 
> trigger the BUG in __netdev_adjacent_dev_remove:
 ...
> When the BUG is converted to a WARN_ON it shows 4 missing adjacencies:
>   eth3 - myvrf, mvrf - eth3, bond1 - myvrf and myvrf - bond1
> 
> All of those are because the __netdev_upper_dev_link function does not
> properly link macvlan lower devices to myvrf when it is enslaved.
 ...
> Rather than try to maintain a linked list of all upper and lower devices
> per netdevice, only track the direct neighbors. The remaining stack can
> be determined by recursively walking the neighbors.
> 
> The existing netdev_for_each_all_upper_dev_rcu,
> netdev_for_each_all_lower_dev and netdev_for_each_all_lower_dev_rcu macros
> are replaced with APIs that walk the upper and lower device lists. The
> new APIs take a callback function and a data arg that is passed to the
> callback for each device in the list. Drivers using the old macros are
> converted in separate patches to make it easier on reviewers. It is an
> API conversion only; no functional change is intended.

Series applied, but the recursion is disappointing.

If we run into problems due to kernel stack depth because of this with
some configurations (reasonable or not, if we allow it then it can't
crash the kernel), we will either need to find a way to make this walk
iterative or revert these changes.

Thanks.

Re: [PATCH net-next 00/15] ethernet: use core min/max MTU checking

2016-10-18 Thread David Miller

From: Jarod Wilson 
Date: Mon, 17 Oct 2016 15:54:02 -0400

> Now that the network stack core min/max MTU checking infrastructure is in
> place, time to start making drivers use it. We'll start with the easiest
> ones, the ethernet drivers, split roughly by vendor, with a catch-all
> patch at the end.
> 
> For the most part, every patch does the same essential thing: removes the
> MTU range checking from the drivers' ndo_change_mtu function, puts those
> ranges into the core net_device min_mtu and max_mtu fields, and where
> possible, removes ndo_change_mtu functions entirely.
> 
> These patches have all been built through the 0-day build infrastructure
> provided by Intel, on top of net-next as of October 17.

Series applied, thanks Jarod.

Re: [PATCH net-next 0/2] Move to BPF selftests

2016-10-18 Thread David Miller

From: Daniel Borkmann 
Date: Mon, 17 Oct 2016 14:28:34 +0200

> This set improves the test_verifier and test_maps suite and moves
> it over to a new BPF selftest directory, so we can keep improving
> it under kernel selftest umbrella. This also integrates a test
> script for checking test_bpf.ko under various JIT options.

Series applied, thanks Daniel.

Re: [PATCH net-next 00/15] ethernet: use core min/max MTU checking

2016-10-18 Thread David Miller

From: Jarod Wilson 
Date: Mon, 17 Oct 2016 16:29:43 -0400

> On Mon, Oct 17, 2016 at 04:03:41PM -0400, David Miller wrote:
>> From: Jarod Wilson 
>> Date: Mon, 17 Oct 2016 15:54:02 -0400
>> 
>> > For the most part, every patch does the same essential thing: removes the
>> > MTU range checking from the drivers' ndo_change_mtu function, puts those
>> > ranges into the core net_device min_mtu and max_mtu fields, and where
>> > possible, removes ndo_change_mtu functions entirely.
>> 
>> Jarod, please read my other posting.
> 
> Done, didn't see it until just after I'd hit send, have replied there as
> well.
> 
>> You've positively broken the maximum MTU for all of these drivers.
>> 
>> That's not cool.
>>
>> And this series fixing things doesn't make things better, because now
>> we've significantly broken bisection for anyone running into this
>> regression.
> 
> Agreed, and my suggestion right now is to revert the 2nd patch from the
> prior series. I believe it can be resubmitted after all other callers of
> ether_setup() have been converted to have their own min/max_mtu.
> 
>> You should have arranged this in such a way that the drivers needing
>> > 1500 byte MTU were not impacted at all by your changes, but that
>> isn't what happened.
> 
> Yeah, I must admit to not looking closely enough at the state the first
> two patches left things in. It was absolutely my intention to not alter
> behaviour in any way, but I neglected to test sufficiently without this
> additional set applied.

So what I'm going to do now is simply just apply your current patch series
to net-next and hope this gets everything working again.

I'm just happy that you acknowledged how badly things got broken, so let's
move on and try to avoid this happening again in the future.

Thanks.

Re: [PATCH 00/10] mm: adjust get_user_pages* functions to explicitly pass FOLL_* flags

2016-10-18 Thread Michal Hocko

On Thu 13-10-16 01:20:10, Lorenzo Stoakes wrote:
> This patch series adjusts functions in the get_user_pages* family such that
> desired FOLL_* flags are passed as an argument rather than implied by flags.
> 
> The purpose of this change is to make the use of FOLL_FORCE explicit so it is
> easier to grep for and clearer to callers that this flag is being used. The 
> use
> of FOLL_FORCE is an issue as it overrides missing VM_READ/VM_WRITE flags for 
> the
> VMA whose pages we are reading from/writing to, which can result in surprising
> behaviour.
> 
> The patch series came out of the discussion around commit 38e0885, which
> addressed a BUG_ON() being triggered when a page was faulted in with PROT_NONE
> set but having been overridden by FOLL_FORCE. do_numa_page() was run on the
> assumption the page _must_ be one marked for NUMA node migration as an actual
> PROT_NONE page would have been dealt with prior to this code path, however
> FOLL_FORCE introduced a situation where this assumption did not hold.
> 
> See https://marc.info/?l=linux-mm&m=147585445805166 for the patch proposal.

I like this cleanup. Tracking FOLL_FORCE users was always a nightmare
and the flag behavior is really subtle so we should better be explicit
about it. I haven't gone through each patch separately but rather
applied the whole series and checked the resulting diff. This all seems
OK to me and feel free to add
Acked-by: Michal Hocko 

I am wondering whether we can go further. E.g. it is not really clear to
me whether we need an explicit FOLL_REMOTE when we can in fact check
mm != current->mm and imply that. Maybe there are some contexts which
wouldn't work, I haven't checked.

Then I am also wondering about FOLL_TOUCH behavior.
__get_user_pages_unlocked has only few callers which used to be
get_user_pages_unlocked before 1e9877902dc7e ("mm/gup: Introduce
get_user_pages_remote()"). To me a dropped FOLL_TOUCH seems
unintentional. Now that get_user_pages_unlocked has gup_flags argument I
guess we might want to get rid of the __g-u-p-u version altogether, no?

__get_user_pages is quite low level and imho shouldn't be exported. It's
only user - kvm - should rather pull those two functions to gup instead
and export them. There is nothing really KVM specific in them.

I also cannot say I would be entirely thrilled about get_user_pages_locked,
we only have one user which can simply do lock g-u-p unlock AFAICS.

I guess there is more work in that area and I do not want to impose all
that work on you, but I couldn't resist once I saw you playing in that
area ;) Definitely a good start!
-- 
Michal Hocko
SUSE Labs

Re: [PATCH 01/28] [v2] netfilter: nf_tables: avoid uninitialized variable warning

2016-10-18 Thread Pablo Neira Ayuso

On Tue, Oct 18, 2016 at 12:05:30AM +0200, Arnd Bergmann wrote:
> The newly added nft_range_eval() function handles the two possible
> nft range operations, but as the compiler warning points out,
> any unexpected value would lead to the 'mismatch' variable being
> used without being initialized:
> 
> net/netfilter/nft_range.c: In function 'nft_range_eval':
> net/netfilter/nft_range.c:45:5: error: 'mismatch' may be used uninitialized 
> in this function [-Werror=maybe-uninitialized]
> 
> This removes the variable in question and instead moves the
> condition into the switch itself, which is potentially more
> efficient than adding a bogus 'default' clause as in my
> first approach, and is nicer than using the 'uninitialized_var'
> macro.

Applied to the nf tree, thanks Arnd.

Re: [PATCH net-next 09/15] ethernet/dlink: use core min/max MTU checking

2016-10-18 Thread Jarod Wilson

On Tue, Oct 18, 2016 at 04:45:51PM +0300, Denis Kirjanov wrote:
> On 10/17/16, Jarod Wilson  wrote:
> > dl2k: min_mtu 68, max_mtu 1536 or 8000, depending on hardware
> > - Removed change_mtu, does nothing productive anymore
> >
> > sundance: min_mtu 68, max_mtu 8191
> >
> > CC: netdev@vger.kernel.org
> > CC: Denis Kirjanov 
> > Signed-off-by: Jarod Wilson 
> > ---
> >  drivers/net/ethernet/dlink/dl2k.c | 22 --
> >  drivers/net/ethernet/dlink/sundance.c |  6 --
> >  2 files changed, 8 insertions(+), 20 deletions(-)
...
> > diff --git a/drivers/net/ethernet/dlink/sundance.c
> > b/drivers/net/ethernet/dlink/sundance.c
> > index 79d8009..eab36ac 100644
> > --- a/drivers/net/ethernet/dlink/sundance.c
> > +++ b/drivers/net/ethernet/dlink/sundance.c
> > @@ -580,6 +580,10 @@ static int sundance_probe1(struct pci_dev *pdev,
> > dev->ethtool_ops = _ops;
> > dev->watchdog_timeo = TX_TIMEOUT;
> >
> > +   /* MTU range: 68 - 8191 */
> > +   dev->min_mtu = ETH_MIN_MTU;
> > +   dev->max_mtu = 8191;
> > +
> ICPlus datasheet defines the max frame size like 0x1514 or 0x4491
> based on the RcvLargeFrames bit in the MACCtrl0 register

I do anticipate this patchset might bring to light some inaccuracies in
min/max mtu values currently implemented in various drivers, but for the
moment, I'm going for 100% identical behavior with what's currently in the
driver, and if you'll look down below...

> > pci_set_drvdata(pdev, dev);
> >
> > i = register_netdev(dev);
> > @@ -713,8 +717,6 @@ static int sundance_probe1(struct pci_dev *pdev,
> >
> >  static int change_mtu(struct net_device *dev, int new_mtu)
> >  {
> > -   if ((new_mtu < 68) || (new_mtu > 8191)) /* Set by RxDMAFrameLen */

 
The 8191 was simply transplanted from right here. I think altering the
value should be the subject of a separate patch.

-- 
Jarod Wilson
ja...@redhat.com

[PATCH] ipv6: fix signedness of tmp_prefered_lft underflow check

2016-10-18 Thread Jiri Bohac

Commit 76506a986dc31394fd1f2741db037d29c7e57843 (IPv6: fix
DESYNC_FACTOR) introduced a buggy check for underflow of
tmp_prefered_lft. tmp_prefered_lft is unsigned, so the condition
is always false.

Signed-off-by: Jiri Bohac 
Reported-by: Julia Lawall 
Fixes: 76506a986dc3 ("IPv6: fix DESYNC_FACTOR")

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cc7c26d..7a043a4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1248,7 +1248,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, 
struct inet6_ifaddr *i
tmp_prefered_lft = idev->cnf.temp_prefered_lft + age -
idev->desync_factor;
/* guard against underflow in case of concurrent updates to cnf */
-   if (unlikely(tmp_prefered_lft < 0))
+   if (unlikely((long)tmp_prefered_lft < 0))
tmp_prefered_lft = 0;
tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
tmp_plen = ifp->prefix_len;

Re: [PATCH v5 4/4] net: phy: leds: add support for led triggers on phy link state change

2016-10-18 Thread Zach Brown

On Tue, Oct 18, 2016 at 09:13:50AM +0200, Andrew Lunn wrote:
> On Mon, Oct 17, 2016 at 10:49:55AM -0500, Zach Brown wrote:
> > Create an option CONFIG_LED_TRIGGER_PHY (default n), which will create a
> > set of led triggers for each instantiated PHY device. There is one LED
> > trigger per link-speed, per-phy.
> > The triggers are registered during phy_attach and unregistered during
> > phy_detach.
> >
> > This allows for a user to configure their system to allow a set of LEDs
> > not controlled by the phy to represent link state changes on the phy.
> > LEDS controlled by the phy are unaffected.
> >
> > For example, we have a board where some of the leds in the
> > RJ45 socket are controlled by the phy, but others are not. Using the
> > triggers provided by this patch the leds not controlled by the phy can
> > be configured to show the current speed of the ethernet connection. The
> > leds controlled by the phy are unaffected.
>
> Is there a clear path how we generalise this, so it could also be used
> to control PHY LEDs?
>

No, this patch set is only concerned with generic LEDs not PHY LEDs.

> The idea i had a while ago was that the PHY LEDS would also have a
> list of triggers which mapped to what the PHY controller could do. So
> to enable the PHY LED to show RxTx activity, you would enable the RxTx
> trigger on the PHY LED.
>
> This needs an extension to the PHY code, in that these triggers are
> specific to the LED, not general across all LEDs. But this is not too
> big an issue.
>
> We could end up that if a PHY LED can be used as a generic LED, your
> 'manual' trigger could be used on it. But it could also support the
> PHY driven 'automatic' link status indication trigger. Do we want two
> triggers for the same or similar information?
>

If generic LEDs can only use the 'manual' triggers then having two triggers
for the same or similar information would be okay. The focus of this patch set
was to allow generic LEDs to represent the current speed of the phy, which is
very useful when the PHY LEDs don't for whatever reason i.e. they don't exist or
are busy representing another aspect.

Re: [PATCH v4 0/4] add support for impedance control for TI dp83867 phy and fix 2nd ethernet on dra72 rev C evm

2016-10-18 Thread David Miller

From: Mugunthan V N 
Date: Tue, 18 Oct 2016 16:50:16 +0530

> Add support for configurable impedance control for TI dp83867
> phy via devicetree. More documentation in [1].
> CPSW second ethernet is not working, fix it by enabling
> impedance configuration on the phy.

Series applied to net-next, thanks.

1 2 >

1 - 100 of 185 matches

Mail list logo