date:20181123

[PATCH net-next v2 2/4] qede: Simplify the usage of qede-flags.

2018-11-23 Thread Sudarsana Reddy Kalluru

The values represented by qede->flags are being used in mixed ways:
  1. As 'value' at some places e.g., QEDE_FLAGS_IS_VF usage
  2. As bit-mask(value) at some places e.g., QEDE_FLAGS_PTP_TX_IN_PRORGESS
 usage.
This implementation poses problems in future when we want to add more flag
values e.g., overlap of the values, overflow of 64-bit storage.

Updated the implementation to go with approach (2) for qede->flags.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
Signed-off-by: Michal Kalderon 
---
 drivers/net/ethernet/qlogic/qede/qede.h  | 11 +++
 drivers/net/ethernet/qlogic/qede/qede_main.c |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_ptp.c  |  6 +++---
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index de98a97..f8ced12 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -168,6 +168,12 @@ struct qede_rdma_dev {
 
 #define QEDE_RFS_MAX_FLTR  256
 
+enum qede_flags_bit {
+   QEDE_FLAGS_IS_VF = 0,
+   QEDE_FLAGS_PTP_TX_IN_PRORGESS,
+   QEDE_FLAGS_TX_TIMESTAMPING_EN
+};
+
 struct qede_dev {
struct qed_dev  *cdev;
struct net_device   *ndev;
@@ -177,10 +183,7 @@ struct qede_dev {
u8  dp_level;
 
unsigned long flags;
-#define QEDE_FLAG_IS_VF BIT(0)
-#define IS_VF(edev) (!!((edev)->flags & QEDE_FLAG_IS_VF))
-#define QEDE_TX_TIMESTAMPING_EN BIT(1)
-#define QEDE_FLAGS_PTP_TX_IN_PRORGESS  BIT(2)
+#define IS_VF(edev) (test_bit(QEDE_FLAGS_IS_VF, &(edev)->flags))
 
const struct qed_eth_ops*ops;
struct qede_ptp *ptp;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 46d0f2e..61f9664 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1086,7 +1086,7 @@ static int __qede_probe(struct pci_dev *pdev, u32 
dp_module, u8 dp_level,
}
 
if (is_vf)
-   edev->flags |= QEDE_FLAG_IS_VF;
+   __set_bit(QEDE_FLAGS_IS_VF, &edev->flags);
 
qede_init_ndev(edev);
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c 
b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
index 013ff56..67c1f6e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
@@ -223,12 +223,12 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev)
 
switch (ptp->tx_type) {
case HWTSTAMP_TX_ON:
-   edev->flags |= QEDE_TX_TIMESTAMPING_EN;
+   __set_bit(QEDE_FLAGS_TX_TIMESTAMPING_EN, &edev->flags);
tx_type = QED_PTP_HWTSTAMP_TX_ON;
break;
 
case HWTSTAMP_TX_OFF:
-   edev->flags &= ~QEDE_TX_TIMESTAMPING_EN;
+   __clear_bit(QEDE_FLAGS_TX_TIMESTAMPING_EN, &edev->flags);
tx_type = QED_PTP_HWTSTAMP_TX_OFF;
break;
 
@@ -518,7 +518,7 @@ void qede_ptp_tx_ts(struct qede_dev *edev, struct sk_buff 
*skb)
if (test_and_set_bit_lock(QEDE_FLAGS_PTP_TX_IN_PRORGESS, &edev->flags))
return;
 
-   if (unlikely(!(edev->flags & QEDE_TX_TIMESTAMPING_EN))) {
+   if (unlikely(!test_bit(QEDE_FLAGS_TX_TIMESTAMPING_EN, &edev->flags))) {
DP_NOTICE(edev,
  "Tx timestamping was not enabled, this packet will 
not be timestamped\n");
} else if (unlikely(ptp->tx_skb)) {
-- 
1.8.3.1

[PATCH net-next v2 4/4] qed: Add support for MBI upgrade over MFW.

2018-11-23 Thread Sudarsana Reddy Kalluru

The patch adds driver support for MBI image update through MFW.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
Signed-off-by: Michal Kalderon 
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h  |  6 
 drivers/net/ethernet/qlogic/qed/qed_main.c | 13 +++--
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  | 45 +++---
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 10 ---
 4 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h 
b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 5c221eb..7e120b5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -12655,6 +12655,7 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_DCBX_NOTIFY_MASK  0x00FF
 #define DRV_MB_PARAM_DCBX_NOTIFY_SHIFT 3
 
+#define DRV_MB_PARAM_NVM_PUT_FILE_BEGIN_MBI 0x3
 #define DRV_MB_PARAM_NVM_LEN_OFFSET24
 
 #define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT   0
@@ -12814,6 +12815,11 @@ struct public_drv_mb {
union drv_union_data union_data;
 };
 
+#define FW_MB_PARAM_NVM_PUT_FILE_REQ_OFFSET_MASK   0x00ff
+#define FW_MB_PARAM_NVM_PUT_FILE_REQ_OFFSET_SHIFT  0
+#define FW_MB_PARAM_NVM_PUT_FILE_REQ_SIZE_MASK 0xff00
+#define FW_MB_PARAM_NVM_PUT_FILE_REQ_SIZE_SHIFT24
+
 enum MFW_DRV_MSG_TYPE {
MFW_DRV_MSG_LINK_CHANGE,
MFW_DRV_MSG_FLR_FW_ACK_FAILED,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c 
b/drivers/net/ethernet/qlogic/qed/qed_main.c
index fff7f04..4b3e682 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1939,21 +1939,30 @@ static int qed_nvm_flash_image_access(struct qed_dev 
*cdev, const u8 **data,
  * 0B  |   0x3 [command index]|
  * 4B  | b'0: check_response?   | b'1-31  reserved|
  * 8B  | File-type |   reserved   |
+ * 12B |Image length in bytes |
  * \--/
  * Start a new file of the provided type
  */
 static int qed_nvm_flash_image_file_start(struct qed_dev *cdev,
  const u8 **data, bool *check_resp)
 {
+   u32 file_type, file_size = 0;
int rc;
 
*data += 4;
*check_resp = !!(**data & BIT(0));
*data += 4;
+   file_type = **data;
 
DP_VERBOSE(cdev, NETIF_MSG_DRV,
-  "About to start a new file of type %02x\n", **data);
-   rc = qed_mcp_nvm_put_file_begin(cdev, **data);
+  "About to start a new file of type %02x\n", file_type);
+   if (file_type == DRV_MB_PARAM_NVM_PUT_FILE_BEGIN_MBI) {
+   *data += 4;
+   file_size = *((u32 *)(*data));
+   }
+
+   rc = qed_mcp_nvm_write(cdev, QED_PUT_FILE_BEGIN, file_type,
+  (u8 *)(&file_size), 4);
*data += 4;
 
return rc;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c 
b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 34ed757..e7f18e3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -2745,24 +2745,6 @@ int qed_mcp_nvm_resp(struct qed_dev *cdev, u8 *p_buf)
return 0;
 }
 
-int qed_mcp_nvm_put_file_begin(struct qed_dev *cdev, u32 addr)
-{
-   struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
-   struct qed_ptt *p_ptt;
-   u32 resp, param;
-   int rc;
-
-   p_ptt = qed_ptt_acquire(p_hwfn);
-   if (!p_ptt)
-   return -EBUSY;
-   rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_NVM_PUT_FILE_BEGIN, addr,
-&resp, &param);
-   cdev->mcp_nvm_resp = resp;
-   qed_ptt_release(p_hwfn, p_ptt);
-
-   return rc;
-}
-
 int qed_mcp_nvm_write(struct qed_dev *cdev,
  u32 cmd, u32 addr, u8 *p_buf, u32 len)
 {
@@ -2776,6 +2758,9 @@ int qed_mcp_nvm_write(struct qed_dev *cdev,
return -EBUSY;
 
switch (cmd) {
+   case QED_PUT_FILE_BEGIN:
+   nvm_cmd = DRV_MSG_CODE_NVM_PUT_FILE_BEGIN;
+   break;
case QED_PUT_FILE_DATA:
nvm_cmd = DRV_MSG_CODE_NVM_PUT_FILE_DATA;
break;
@@ -2788,10 +2773,14 @@ int qed_mcp_nvm_write(struct qed_dev *cdev,
goto out;
}
 
+   buf_size = min_t(u32, (len - buf_idx), MCP_DRV_NVM_BUF_LEN);
while (buf_idx < len) {
-   buf_size = min_t(u32, (len - buf_idx), MCP_DRV_NVM_BUF_LEN);
-   nvm_offset = ((buf_size << DRV_MB_PARAM_NVM_LEN_OFFSET) |
- addr) + buf_idx;
+   if (cmd == QED_PUT_FILE_BEGIN)
+   nvm_offset = addr;
+   else
+   nvm_offset = ((buf_size <<
+

[PATCH net-next v2 3/4] qede: Update link status only when interface is ready.

2018-11-23 Thread Sudarsana Reddy Kalluru

In the case of internal reload (e.g., mtu change), there could be a race
between link-up notification from mfw and the driver unload processing. In
such case kernel assumes the link is up and starts using the queues which
leads to the server crash.

Send link notification to the kernel only when driver has already requested
MFW for the link.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
Signed-off-by: Michal Kalderon 
---
 drivers/net/ethernet/qlogic/qede/qede.h  | 1 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 8 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h 
b/drivers/net/ethernet/qlogic/qede/qede.h
index f8ced12..8c0fe59 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -170,6 +170,7 @@ struct qede_rdma_dev {
 
 enum qede_flags_bit {
QEDE_FLAGS_IS_VF = 0,
+   QEDE_FLAGS_LINK_REQUESTED,
QEDE_FLAGS_PTP_TX_IN_PRORGESS,
QEDE_FLAGS_TX_TIMESTAMPING_EN
 };
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 61f9664..c6e387e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2057,6 +2057,8 @@ static void qede_unload(struct qede_dev *edev, enum 
qede_unload_mode mode,
if (!is_locked)
__qede_lock(edev);
 
+   clear_bit(QEDE_FLAGS_LINK_REQUESTED, &edev->flags);
+
edev->state = QEDE_STATE_CLOSED;
 
qede_rdma_dev_event_close(edev);
@@ -2163,6 +2165,8 @@ static int qede_load(struct qede_dev *edev, enum 
qede_load_mode mode,
/* Program un-configured VLANs */
qede_configure_vlan_filters(edev);
 
+   set_bit(QEDE_FLAGS_LINK_REQUESTED, &edev->flags);
+
/* Ask for link-up using current configuration */
memset(&link_params, 0, sizeof(link_params));
link_params.link_up = true;
@@ -2258,8 +2262,8 @@ static void qede_link_update(void *dev, struct 
qed_link_output *link)
 {
struct qede_dev *edev = dev;
 
-   if (!netif_running(edev->ndev)) {
-   DP_VERBOSE(edev, NETIF_MSG_LINK, "Interface is not running\n");
+   if (!test_bit(QEDE_FLAGS_LINK_REQUESTED, &edev->flags)) {
+   DP_VERBOSE(edev, NETIF_MSG_LINK, "Interface is not ready\n");
return;
}
 
-- 
1.8.3.1

[PATCH net-next v2 1/4] qed: Display port_id in the UFP debug messages.

2018-11-23 Thread Sudarsana Reddy Kalluru

MFW sends UFP notifications mostly during the device init phase and PFs
might not be assigned with a name by this time. Hence capturing port-id in
the debug messages would help in finding which PF the ufp notification was
sent to.

Also, fixed a minor semantic issue in a debug print.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
Signed-off-by: Michal Kalderon 
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c 
b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index a96364d..34ed757 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1619,7 +1619,7 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
qed_sp_pf_update_stag(p_hwfn);
}
 
-   DP_VERBOSE(p_hwfn, QED_MSG_SP, "ovlan  = %d hw_mode = 0x%x\n",
+   DP_VERBOSE(p_hwfn, QED_MSG_SP, "ovlan = %d hw_mode = 0x%x\n",
   p_hwfn->mcp_info->func_info.ovlan, p_hwfn->hw_info.hw_mode);
 
/* Acknowledge the MFW */
@@ -1641,7 +1641,9 @@ void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
val = (port_cfg & OEM_CFG_CHANNEL_TYPE_MASK) >>
OEM_CFG_CHANNEL_TYPE_OFFSET;
if (val != OEM_CFG_CHANNEL_TYPE_STAGGED)
-   DP_NOTICE(p_hwfn, "Incorrect UFP Channel type  %d\n", val);
+   DP_NOTICE(p_hwfn,
+ "Incorrect UFP Channel type  %d port_id 0x%02x\n",
+ val, MFW_PORT(p_hwfn));
 
val = (port_cfg & OEM_CFG_SCHED_TYPE_MASK) >> OEM_CFG_SCHED_TYPE_OFFSET;
if (val == OEM_CFG_SCHED_TYPE_ETS) {
@@ -1650,7 +1652,9 @@ void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
p_hwfn->ufp_info.mode = QED_UFP_MODE_VNIC_BW;
} else {
p_hwfn->ufp_info.mode = QED_UFP_MODE_UNKNOWN;
-   DP_NOTICE(p_hwfn, "Unknown UFP scheduling mode %d\n", val);
+   DP_NOTICE(p_hwfn,
+ "Unknown UFP scheduling mode %d port_id 0x%02x\n",
+ val, MFW_PORT(p_hwfn));
}
 
qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn));
@@ -1665,13 +1669,15 @@ void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, 
struct qed_ptt *p_ptt)
p_hwfn->ufp_info.pri_type = QED_UFP_PRI_OS;
} else {
p_hwfn->ufp_info.pri_type = QED_UFP_PRI_UNKNOWN;
-   DP_NOTICE(p_hwfn, "Unknown Host priority control %d\n", val);
+   DP_NOTICE(p_hwfn,
+ "Unknown Host priority control %d port_id 0x%02x\n",
+ val, MFW_PORT(p_hwfn));
}
 
DP_NOTICE(p_hwfn,
- "UFP shmem config: mode = %d tc = %d pri_type = %d\n",
- p_hwfn->ufp_info.mode,
- p_hwfn->ufp_info.tc, p_hwfn->ufp_info.pri_type);
+ "UFP shmem config: mode = %d tc = %d pri_type = %d port_id 
0x%02x\n",
+ p_hwfn->ufp_info.mode, p_hwfn->ufp_info.tc,
+ p_hwfn->ufp_info.pri_type, MFW_PORT(p_hwfn));
 }
 
 static int
-- 
1.8.3.1

[PATCH net-next v2 0/4] qed* enhancements series

2018-11-23 Thread Sudarsana Reddy Kalluru

From: Sudarsana Reddy Kalluru 

The patch series adds a few enhancements to qed/qede drivers.

Changes from previous versions:
---
v2: Use __set_bit()/__clear_bit() where data access doesn't need to be
atomic.

Please consider applying it to "net-next".

Sudarsana Reddy Kalluru (4):
  qed: Display port_id in the UFP debug messages.
  qede: Simplify the usage of qede-flags.
  qede: Update link status only when interface is ready.
  qed: Add support for MBI upgrade over MFW.

 drivers/net/ethernet/qlogic/qed/qed_hsi.h|  6 +++
 drivers/net/ethernet/qlogic/qed/qed_main.c   | 13 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c| 65 +++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.h| 10 -
 drivers/net/ethernet/qlogic/qede/qede.h  | 12 +++--
 drivers/net/ethernet/qlogic/qede/qede_main.c | 10 +++--
 drivers/net/ethernet/qlogic/qede/qede_ptp.c  |  6 +--
 7 files changed, 71 insertions(+), 51 deletions(-)

-- 
1.8.3.1

Re: consistency for statistics with XDP mode

2018-11-23 Thread David Miller

From: David Ahern 
Date: Thu, 22 Nov 2018 09:51:27 -0700

> I would like to see basic packets, bytes, and dropped counters tracked
> for Rx and Tx via the standard netdev counters for all devices. This is
> for ease in accounting as well as speed and simplicity for bumping
> counters for virtual devices from bpf helpers.
> 
> From there, the XDP ones can be in the driver private stats as they are
> currently but with some consistency across drivers for redirects, drops,
> any thing else.

I would go so far as to say we should provide generic infrastructure
for this, in the format of a template of statistic name strings, a
template structure to hold the counters, etc.

[PATCH iproute2-next] tc: fq: support ce_threshold attribute

2018-11-23 Thread Eric Dumazet

Kernel commit 48872c11b772 ("net_sched: sch_fq: add dctcp-like marking")
added support for TCA_FQ_CE_THRESHOLD attribute.

This patch adds iproute2 support for it.

It also makes sure fq_print_xstats() can deal with smaller tc_fq_qd_stats
structures given by older kernels.

Usage :

FQATTRS="ce_threshold 4ms"
TXQS=8

for ETH in eth0
do
 tc qd del dev $ETH root 2>/dev/null
 tc qd add dev $ETH root handle 1: mq
 for i in `seq 1 $TXQS`
 do
  tc qd add dev $ETH parent 1:$i fq $FQATTRS
 done
done

Signed-off-by: Eric Dumazet 
---
 tc/q_fq.c | 32 
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/tc/q_fq.c b/tc/q_fq.c
index 
f3dbf2ba0c6f520ec1080b90fa4f08c968325102..a4174380d5d49730e1f7b2d9e83d684f852aa3cf
 100644
--- a/tc/q_fq.c
+++ b/tc/q_fq.c
@@ -56,6 +56,7 @@ static void explain(void)
fprintf(stderr, "  [ [no]pacing ] [ refill_delay TIME ]\n");
fprintf(stderr, "  [ low_rate_threshold RATE ]\n");
fprintf(stderr, "  [ orphan_mask MASK]\n");
+   fprintf(stderr, "  [ ce_threshold TIME ]\n");
 }
 
 static unsigned int ilog2(unsigned int val)
@@ -83,6 +84,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char 
**argv,
unsigned int defrate;
unsigned int refill_delay;
unsigned int orphan_mask;
+   unsigned int ce_threshold;
bool set_plimit = false;
bool set_flow_plimit = false;
bool set_quantum = false;
@@ -92,6 +94,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char 
**argv,
bool set_refill_delay = false;
bool set_orphan_mask = false;
bool set_low_rate_threshold = false;
+   bool set_ce_threshold = false;
int pacing = -1;
struct rtattr *tail;
 
@@ -135,6 +138,13 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv,
return -1;
}
set_low_rate_threshold = true;
+   } else if (strcmp(*argv, "ce_threshold") == 0) {
+   NEXT_ARG();
+   if (get_time(&ce_threshold, *argv)) {
+   fprintf(stderr, "Illegal \"ce_threshold\"\n");
+   return -1;
+   }
+   set_ce_threshold = true;
} else if (strcmp(*argv, "defrate") == 0) {
NEXT_ARG();
if (strchr(*argv, '%')) {
@@ -226,6 +236,9 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv,
if (set_orphan_mask)
addattr_l(n, 1024, TCA_FQ_ORPHAN_MASK,
  &orphan_mask, sizeof(refill_delay));
+   if (set_ce_threshold)
+   addattr_l(n, 1024, TCA_FQ_CE_THRESHOLD,
+ &ce_threshold, sizeof(ce_threshold));
addattr_nest_end(n, tail);
return 0;
 }
@@ -239,6 +252,7 @@ static int fq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
unsigned int rate, quantum;
unsigned int refill_delay;
unsigned int orphan_mask;
+   unsigned int ce_threshold;
 
SPRINT_BUF(b1);
 
@@ -310,21 +324,28 @@ static int fq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
fprintf(f, "refill_delay %s ", sprint_time(refill_delay, b1));
}
 
+   if (tb[TCA_FQ_CE_THRESHOLD] &&
+   RTA_PAYLOAD(tb[TCA_FQ_CE_THRESHOLD]) >= sizeof(__u32)) {
+   ce_threshold = rta_getattr_u32(tb[TCA_FQ_CE_THRESHOLD]);
+   if (ce_threshold != ~0U)
+   fprintf(f, "ce_threshold %s ", 
sprint_time(ce_threshold, b1));
+   }
+
return 0;
 }
 
 static int fq_print_xstats(struct qdisc_util *qu, FILE *f,
   struct rtattr *xstats)
 {
-   struct tc_fq_qd_stats *st;
+   struct tc_fq_qd_stats *st, _st;
 
if (xstats == NULL)
return 0;
 
-   if (RTA_PAYLOAD(xstats) < sizeof(*st))
-   return -1;
+   memset(&_st, 0, sizeof(_st));
+   memcpy(&_st, RTA_DATA(xstats), min(RTA_PAYLOAD(xstats), sizeof(*st)));
 
-   st = RTA_DATA(xstats);
+   st = &_st;
 
fprintf(f, "  %u flows (%u inactive, %u throttled)",
st->flows, st->inactive_flows, st->throttled_flows);
@@ -343,6 +364,9 @@ static int fq_print_xstats(struct qdisc_util *qu, FILE *f,
if (st->unthrottle_latency_ns)
fprintf(f, ", %u ns latency", st->unthrottle_latency_ns);
 
+   if (st->ce_mark)
+   fprintf(f, ", %llu ce_mark", st->ce_mark);
+
if (st->flows_plimit)
fprintf(f, ", %llu flows_plimit", st->flows_plimit);
 
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog

Re: [PATCH] net: gemini: Fix copy/paste error

2018-11-23 Thread David Miller

From: Linus Walleij 
Date: Sat, 24 Nov 2018 00:16:34 +0100

> From: Andreas Fiedler 
> 
> The TX stats should be started with the tx_stats_syncp,
> there seems to be a copy/paste error in the driver.
> 
> Signed-off-by: Andreas Fiedler 
> Signed-off-by: Linus Walleij 

Applied.

Re: [PATCH] dt-bindings: dsa: Fix typo in "probed"

2018-11-23 Thread David Miller

From: Fabio Estevam 
Date: Fri, 23 Nov 2018 15:46:50 -0200

> The correct form is "can be probed", so fix the typo.
> 
> Signed-off-by: Fabio Estevam 

Applied.

Re: [PATCH net] net: thunderx: set tso_hdrs pointer to NULL in nicvf_free_snd_queue

2018-11-23 Thread David Miller

From: Lorenzo Bianconi 
Date: Fri, 23 Nov 2018 18:28:01 +0100

> Reset snd_queue tso_hdrs pointer to NULL in nicvf_free_snd_queue routine
> since it is used to check if tso dma descriptor queue has been previously
> allocated. The issue can be triggered with the following reproducer:
> 
> $ip link set dev enP2p1s0v0 xdpdrv obj xdp_dummy.o
> $ip link set dev enP2p1s0v0 xdpdrv off
 ...
> where xdp_dummy.c is a simple bpf program that forwards the incoming
> frames to the network stack (available here:
> https://github.com/altoor/xdp_walkthrough_examples/blob/master/sample_1/xdp_dummy.c)
> 
> Fixes: 05c773f52b96 ("net: thunderx: Add basic XDP support")
> Fixes: 4863dea3fab0 ("net: Adding support for Cavium ThunderX network
> controller")
> 
> Signed-off-by: Lorenzo Bianconi 

Applied and queued up for -stable, but please in the future:

1) Do not break up long "Fixes: " tag lines, it must be kept as a single
   uninterrupted line for grep'ability etc.

2) Do not put an empty line between "Fixes: " and other tags.  All tags
   are equal, and appear in a straight uninterrupted sequence of lines.

Thank you.

Re: [PATCH net-next 2/4] qede: Simplify the usage of qede-flags.

2018-11-23 Thread David Miller

From: Sudarsana Reddy Kalluru 
Date: Thu, 22 Nov 2018 22:44:49 -0800

> - edev->flags |= QEDE_FLAG_IS_VF;
> + set_bit(QEDE_FLAGS_IS_VF, &edev->flags);
...
> - edev->flags |= QEDE_TX_TIMESTAMPING_EN;
> + set_bit(QEDE_FLAGS_TX_TIMESTAMPING_EN, &edev->flags);
 ...
> - edev->flags &= ~QEDE_TX_TIMESTAMPING_EN;
> + clear_bit(QEDE_FLAGS_TX_TIMESTAMPING_EN, &edev->flags);

Unless these operations need to be atomic, use __set_bit and __clear_bit.

Re: [PATCH net-next 00/12] switchdev: Convert switchdev_port_obj_{add,del}() to notifiers

2018-11-23 Thread David Miller

From: Petr Machata 
Date: Thu, 22 Nov 2018 23:27:52 +

> An offloading driver may need to have access to switchdev events on
> ports that aren't directly under its control. An example is a VXLAN port
> attached to a bridge offloaded by a driver. The driver needs to know
> about VLANs configured on the VXLAN device. However the VXLAN device
> isn't stashed between the bridge and a front-panel-port device (such as
> is the case e.g. for LAG devices), so the usual switchdev ops don't
> reach the driver.
> 
> VXLAN is likely not the only device type like this: in theory any L2
> tunnel device that needs offloading will prompt requirement of this
> sort.
> 
> A way to fix this is to give up the notion of port object addition /
> deletion as a switchdev operation, which assumes somewhat tight coupling
> between the message producer and consumer. And instead send the message
> over a notifier chain.
 ...

Series applied, thank you.

Re: [PATCH net-next 0/5] r8169: some functional improvements

2018-11-23 Thread David Miller

From: Heiner Kallweit 
Date: Thu, 22 Nov 2018 21:54:36 +0100

> This series includes a few functional improvements.

Series applied.

Re: [PATCH net-next 0/4] octeontx2-af: CGX LMAC link bringup and cleanups

2018-11-23 Thread David Miller

From: Linu Cherian 
Date: Thu, 22 Nov 2018 17:18:33 +0530

> From: Linu Cherian 
> 
> Patch 1: Code cleanup
> Patch 2: Adds support for an unhandled hardware configuration 
> Patch 3: Preparatory patch for enabling cgx lmac links 
> Patch 4: Support for enabling cgx lmac links 

Series applied.

Re: [PATCH net-next 00/10] net/smc: patches 2018-11-22

2018-11-23 Thread David Miller

From: Ursula Braun 
Date: Thu, 22 Nov 2018 10:26:33 +0100

> here are more patches for SMC:
> * patches 1-3 and 7 are cleanups without functional change
> * patches 4-6 and 8 are optimizations of existing code
> * patches 9 and 10 introduce and exploit LLC message DELETE RKEY

Series applied.

Re: [PATCH net] team: no need to do team_notify_peers or team_mcast_rejoin when disabling port

2018-11-23 Thread David Miller

From: Hangbin Liu 
Date: Thu, 22 Nov 2018 16:15:28 +0800

> team_notify_peers() will send ARP and NA to notify peers. team_mcast_rejoin()
> will send multicast join group message to notify peers. We should do this when
> enabling/changed to a new port. But it doesn't make sense to do it when a port
> is disabled.
> 
> On the other hand, when we set mcast_rejoin_count to 2, and do a failover,
> team_port_disable() will increase mcast_rejoin.count_pending to 2 and then
> team_port_enable() will increase mcast_rejoin.count_pending to 4. We will send
> 4 mcast rejoin messages at latest, which will make user confused. The same
> with notify_peers.count.
> 
> Fix it by deleting team_notify_peers() and team_mcast_rejoin() in
> team_port_disable().
> 
> Reported-by: Liang Li 
> Fixes: fc423ff00df3a ("team: add peer notification")
> Fixes: 492b200efdd20 ("team: add support for sending multicast rejoins")
> Signed-off-by: Hangbin Liu 

Applied.

Re: [PATCH mac80211-next v4] mac80211-next: rtnetlink wifi simulation device

2018-11-23 Thread kbuild test robot

Hi Cody,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on mac80211-next/master]

url:
https://github.com/0day-ci/linux/commits/Cody-Schuffelen/mac80211-next-rtnetlink-wifi-simulation-device/20181124-020949
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git 
master
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=ia64 

All errors (new ones prefixed by >>):

   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_cancel_scan':
>> virt_wifi.c:(.text+0x101): undefined reference to `cfg80211_scan_done'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_cancel_connect':
>> virt_wifi.c:(.text+0x1f2): undefined reference to `cfg80211_connect_done'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_connect_complete':
   virt_wifi.c:(.text+0x562): undefined reference to `cfg80211_connect_done'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_scan_result':
>> virt_wifi.c:(.text+0x1022): undefined reference to `cfg80211_inform_bss_data'
>> virt_wifi.c:(.text+0x1032): undefined reference to `cfg80211_put_bss'
   virt_wifi.c:(.text+0x1052): undefined reference to `cfg80211_scan_done'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_disconnect':
>> virt_wifi.c:(.text+0x13d2): undefined reference to `cfg80211_disconnected'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_destroy_wiphy':
>> virt_wifi.c:(.text.unlikely+0x92): undefined reference to `wiphy_unregister'
>> virt_wifi.c:(.text.unlikely+0xa2): undefined reference to `wiphy_free'
   drivers/net/wireless/virt_wifi.o: In function `virt_wifi_init_module':
>> virt_wifi.c:(.init.text+0x72): undefined reference to `wiphy_new_nm'
>> virt_wifi.c:(.init.text+0x1f2): undefined reference to `wiphy_register'
>> virt_wifi.c:(.init.text+0x232): undefined reference to `wiphy_free'

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

[PATCH bpf-next v2 4/4] libbpf: Document API and ABI conventions

2018-11-23 Thread Andrey Ignatov

Document API and ABI for libbpf: naming convention, symbol visibility,
ABI versioning.

This is just a starting point. Documentation can be significantly
extended in the future to cover more topics.

ABI versioning section touches only a few basic points with a link to
more comprehensive documentation from Ulrich Drepper. This section can
be extended in the future when there is better understanding what works
well and what not so well in libbpf development process and production
usage.

Signed-off-by: Andrey Ignatov 
---
 tools/lib/bpf/README.rst | 139 +++
 1 file changed, 139 insertions(+)
 create mode 100644 tools/lib/bpf/README.rst

diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst
new file mode 100644
index ..2ced9e061c4b
--- /dev/null
+++ b/tools/lib/bpf/README.rst
@@ -0,0 +1,139 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+libbpf API naming convention
+
+
+libbpf API provides access to a few logically separated groups of
+functions and types. Every group has its own naming convention
+described here. It's recommended to follow these conventions whenever a
+new function or type is added to keep libbpf API clean and consistent.
+
+All types and functions provided by libbpf API should have one of the
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``.
+
+System call wrappers
+
+
+System call wrappers are simple wrappers for commands supported by
+sys_bpf system call. These wrappers should go to ``bpf.h`` header file
+and map one-on-one to corresponding commands.
+
+For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM``
+command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc.
+
+Objects
+---
+
+Another class of types and functions provided by libbpf API is "objects"
+and functions to work with them. Objects are high-level abstractions
+such as BPF program or BPF map. They're represented by corresponding
+structures such as ``struct bpf_object``, ``struct bpf_program``,
+``struct bpf_map``, etc.
+
+Structures are forward declared and access to their fields should be
+provided via corresponding getters and setters rather than directly.
+
+These objects are associated with corresponding parts of ELF object that
+contains compiled BPF programs.
+
+For example ``struct bpf_object`` represents ELF object itself created
+from an ELF file or from a buffer, ``struct bpf_program`` represents a
+program in ELF object and ``struct bpf_map`` is a map.
+
+Functions that work with an object have names built from object name,
+double underscore and part that describes function purpose.
+
+For example ``bpf_object__open`` consists of the name of corresponding
+object, ``bpf_object``, double underscore and ``open`` that defines the
+purpose of the function to open ELF file and create ``bpf_object`` from
+it.
+
+Another example: ``bpf_program__load`` is named for corresponding
+object, ``bpf_program``, that is separated from other part of the name
+by double underscore.
+
+All objects and corresponding functions other than BTF related should go
+to ``libbpf.h``. BTF types and functions should go to ``btf.h``.
+
+Auxiliary functions
+---
+
+Auxiliary functions and types that don't fit well in any of categories
+described above should have ``libbpf_`` prefix, e.g.
+``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
+
+libbpf ABI
+==
+
+libbpf can be both linked statically or used as DSO. To avoid possible
+conflicts with other libraries an application is linked with, all
+non-static libbpf symbols should have one of the prefixes mentioned in
+API documentation above. See API naming convention to choose the right
+name for a new symbol.
+
+Symbol visibility
+-
+
+libbpf follows the model where all global symbols have visibility "hidden"
+by default, and to make a symbol visible it has to be explicitly
+attributed with the ``LIBBPF_API`` macro. For example:
+
+.. code-block:: c
+
+LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
+
+This prevents accidentally exporting a symbol that is not supposed
+to be a part of the ABI, which, in turn, improves both the libbpf developer
+and user experience.
+
+ABI versioning
+---
+
+To make future ABI extensions possible libbpf ABI is versioned.
+Versioning is implemented by ``libbpf.map`` version script that is
+passed to linker.
+
+Version name is ``LIBBPF_`` prefix + three-component numeric version,
+starting from ``0.0.1``.
+
+Every time ABI is being changed, e.g. because a new symbol is added or
+semantic of existing symbol is changed, ABI version should be bumped.
+
+For example, if current state of ``libbpf.map`` is:
+
+.. code-block::
+LIBBPF_0.0.1 {
+   global:
+bpf_func_a;
+bpf_func_b;
+   local:
+   \*;
+};
+
+, and a new symbol ``bpf_func_c`` is being introduced, then

[PATCH bpf-next v2 2/4] libbpf: Add version script for DSO

2018-11-23 Thread Andrey Ignatov

More and more projects use libbpf and one day it'll likely be packaged
and distributed as DSO and that requires ABI versioning so that both
compatible and incompatible changes to ABI can be introduced in a safe
way in the future without breaking executables dynamically linked with a
previous version of the library.

Usual way to do ABI versioning is version script for the linker. Add
such a script for libbpf. All global symbols currently exported via
LIBBPF_API macro are added to the version script libbpf.map.

The version name LIBBPF_0.0.1 is constructed from the name of the
library + version specified by $(LIBBPF_VERSION) in Makefile.

Version script does not duplicate the work done by LIBBPF_API macro, it
rather complements it. The macro is used at compile time and can be used
by compiler to do optimization that can't be done at link time, it is
purely about global symbol visibility. The version script, in turn, is
used at link time and takes care of ABI versioning. Both techniques are
described in details in [1].

Whenever ABI is changed in the future, version script should be changed
appropriately.

[1] https://www.akkadia.org/drepper/dsohowto.pdf

Signed-off-by: Andrey Ignatov 
---
 tools/lib/bpf/Makefile   |   4 +-
 tools/lib/bpf/libbpf.map | 121 +++
 2 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/bpf/libbpf.map

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 1b4a683a00fc..22c5ffe22825 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -145,6 +145,7 @@ include $(srctree)/tools/build/Makefile.include
 
 BPF_IN:= $(OUTPUT)libbpf-in.o
 LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
+VERSION_SCRIPT := libbpf.map
 
 CMD_TARGETS = $(LIB_FILE)
 
@@ -176,7 +177,8 @@ $(BPF_IN): force elfdep bpfdep
$(Q)$(MAKE) $(build)=libbpf
 
 $(OUTPUT)libbpf.so: $(BPF_IN)
-   $(QUIET_LINK)$(CC) --shared $^ -o $@
+   $(QUIET_LINK)$(CC) --shared -Wl,--version-script=$(VERSION_SCRIPT) \
+   $^ -o $@
 
 $(OUTPUT)libbpf.a: $(BPF_IN)
$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
new file mode 100644
index ..4fb29f6d7a80
--- /dev/null
+++ b/tools/lib/bpf/libbpf.map
@@ -0,0 +1,121 @@
+LIBBPF_0.0.1 {
+   global:
+   bpf_btf_get_fd_by_id;
+   bpf_create_map;
+   bpf_create_map_in_map;
+   bpf_create_map_in_map_node;
+   bpf_create_map_name;
+   bpf_create_map_node;
+   bpf_create_map_xattr;
+   bpf_load_btf;
+   bpf_load_program;
+   bpf_load_program_xattr;
+   bpf_map__btf_key_type_id;
+   bpf_map__btf_value_type_id;
+   bpf_map__def;
+   bpf_map__fd;
+   bpf_map__is_offload_neutral;
+   bpf_map__name;
+   bpf_map__next;
+   bpf_map__pin;
+   bpf_map__prev;
+   bpf_map__priv;
+   bpf_map__reuse_fd;
+   bpf_map__set_ifindex;
+   bpf_map__set_inner_map_fd;
+   bpf_map__set_priv;
+   bpf_map__unpin;
+   bpf_map_delete_elem;
+   bpf_map_get_fd_by_id;
+   bpf_map_get_next_id;
+   bpf_map_get_next_key;
+   bpf_map_lookup_and_delete_elem;
+   bpf_map_lookup_elem;
+   bpf_map_update_elem;
+   bpf_obj_get;
+   bpf_obj_get_info_by_fd;
+   bpf_obj_pin;
+   bpf_object__btf_fd;
+   bpf_object__close;
+   bpf_object__find_map_by_name;
+   bpf_object__find_map_by_offset;
+   bpf_object__find_program_by_title;
+   bpf_object__kversion;
+   bpf_object__load;
+   bpf_object__name;
+   bpf_object__next;
+   bpf_object__open;
+   bpf_object__open_buffer;
+   bpf_object__open_xattr;
+   bpf_object__pin;
+   bpf_object__pin_maps;
+   bpf_object__pin_programs;
+   bpf_object__priv;
+   bpf_object__set_priv;
+   bpf_object__unload;
+   bpf_object__unpin_maps;
+   bpf_object__unpin_programs;
+   bpf_perf_event_read_simple;
+   bpf_prog_attach;
+   bpf_prog_detach;
+   bpf_prog_detach2;
+   bpf_prog_get_fd_by_id;
+   bpf_prog_get_next_id;
+   bpf_prog_load;
+   bpf_prog_load_xattr;
+   bpf_prog_query;
+   bpf_prog_test_run;
+   bpf_program__fd;
+   bpf_program__is_kprobe;
+   bpf_program__is_perf_event;
+   bpf_program__is_raw_tracepoint;
+   bpf_program__is_sched_act;
+   bpf_program__is_sched_cls;
+

[PATCH bpf-next v2 3/4] libbpf: Verify versioned symbols

2018-11-23 Thread Andrey Ignatov

Since ABI versioning info is kept separately from the code it's easy to
forget to update it while adding a new API.

Add simple verification that all global symbols exported with LIBBPF_API
are versioned in libbpf.map version script.

The idea is to check that number of global symbols in libbpf-in.o, that
is the input to the linker, matches with number of unique versioned
symbols in libbpf.so, that is the output of the linker. If these numbers
don't match, it may mean some symbol was not versioned and make will
fail.

"Unique" means that if a symbol is present in more than one version of
ABI due to ABI changes, it'll be counted once.

Another option to calculate number of global symbols in the "input"
could be to count number of LIBBPF_API entries in C headers but it seems
to be fragile.

Example of output when a symbol is missing in version script:

...
LD   libbpf-in.o
LINK libbpf.a
LINK libbpf.so
  Warning: Num of global symbols in libbpf-in.o (115) does NOT match
  with num of versioned symbols in libbpf.so (114). Please make sure all
  LIBBPF_API symbols are versioned in libbpf.map.
  make: *** [check_abi] Error 1

Signed-off-by: Andrey Ignatov 
---
 tools/lib/bpf/Makefile | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 22c5ffe22825..34d9c3619c96 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -147,6 +147,11 @@ BPF_IN:= $(OUTPUT)libbpf-in.o
 LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
 VERSION_SCRIPT := libbpf.map
 
+GLOBAL_SYM_COUNT = $(shell readelf -s $(BPF_IN) | \
+  awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print 
s}')
+VERSIONED_SYM_COUNT = $(shell readelf -s $(OUTPUT)libbpf.so | \
+ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u 
| wc -l)
+
 CMD_TARGETS = $(LIB_FILE)
 
 CXX_TEST_TARGET = $(OUTPUT)test_libbpf
@@ -159,7 +164,7 @@ TARGETS = $(CMD_TARGETS)
 
 all: fixdep all_cmd
 
-all_cmd: $(CMD_TARGETS)
+all_cmd: $(CMD_TARGETS) check
 
 $(BPF_IN): force elfdep bpfdep
@(test -f ../../include/uapi/linux/bpf.h -a -f 
../../../include/uapi/linux/bpf.h && ( \
@@ -186,6 +191,18 @@ $(OUTPUT)libbpf.a: $(BPF_IN)
 $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a
$(QUIET_LINK)$(CXX) $^ -lelf -o $@
 
+check: check_abi
+
+check_abi: $(OUTPUT)libbpf.so
+   @if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then  \
+   echo "Warning: Num of global symbols in $(BPF_IN)"   \
+"($(GLOBAL_SYM_COUNT)) does NOT match with num of"  \
+"versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \
+"Please make sure all LIBBPF_API symbols are"   \
+"versioned in $(VERSION_SCRIPT)." >&2;  \
+   exit 1;  \
+   fi
+
 define do_install
if [ ! -d '$(DESTDIR_SQ)$2' ]; then \
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
-- 
2.17.1

[PATCH bpf-next v2 0/4] libbpf: ABI versioning and documentation

2018-11-23 Thread Andrey Ignatov

This patch set adds ABI versioning and documentation to libbpf.

Patch 1 renames btf_get_from_id to btf__get_from_id to follow naming
convention.
Patch 2 adds version script and has more details on ABI versioning.
Patch 3 adds simple check that all global symbols are versioned.
Patch 4 documents a few aspects of libbpf API and ABI in dev process.

v1->v2:
* add patch from Martin KaFai Lau  to rename btf_get_from_id;
* add documentation for libbpf API and ABI.


Andrey Ignatov (3):
  libbpf: Add version script for DSO
  libbpf: Verify versioned symbols
  libbpf: Document API and ABI conventions

Martin KaFai Lau (1):
  libbpf: Name changing for btf_get_from_id

 tools/bpf/bpftool/map.c|   4 +-
 tools/bpf/bpftool/prog.c   |   2 +-
 tools/lib/bpf/Makefile |  23 +++-
 tools/lib/bpf/README.rst   | 139 +
 tools/lib/bpf/btf.c|   2 +-
 tools/lib/bpf/btf.h|   2 +-
 tools/lib/bpf/libbpf.map   | 121 +
 tools/testing/selftests/bpf/test_btf.c |   2 +-
 8 files changed, 287 insertions(+), 8 deletions(-)
 create mode 100644 tools/lib/bpf/README.rst
 create mode 100644 tools/lib/bpf/libbpf.map

-- 
2.17.1

[PATCH bpf-next v2 1/4] libbpf: Name changing for btf_get_from_id

2018-11-23 Thread Andrey Ignatov

From: Martin KaFai Lau 

s/btf_get_from_id/btf__get_from_id/ to restore the API naming convention.

Signed-off-by: Martin KaFai Lau 
Signed-off-by: Andrey Ignatov 
---
 tools/bpf/bpftool/map.c| 4 ++--
 tools/bpf/bpftool/prog.c   | 2 +-
 tools/lib/bpf/btf.c| 2 +-
 tools/lib/bpf/btf.h| 2 +-
 tools/testing/selftests/bpf/test_btf.c | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a1ae2a3e9fef..96be42f288f5 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -711,7 +711,7 @@ static int do_dump(int argc, char **argv)
 
prev_key = NULL;
 
-   err = btf_get_from_id(info.btf_id, );
+   err = btf__get_from_id(info.btf_id, );
if (err) {
p_err("failed to get btf");
goto exit_free;
@@ -855,7 +855,7 @@ static int do_lookup(int argc, char **argv)
}
 
/* here means bpf_map_lookup_elem() succeeded */
-   err = btf_get_from_id(info.btf_id, );
+   err = btf__get_from_id(info.btf_id, );
if (err) {
p_err("failed to get btf");
goto exit_free;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 37b1daf19da6..521a1073d1b4 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -622,7 +622,7 @@ static int do_dump(int argc, char **argv)
goto err_free;
}
 
-   if (info.btf_id && btf_get_from_id(info.btf_id, )) {
+   if (info.btf_id && btf__get_from_id(info.btf_id, )) {
p_err("failed to get btf");
goto err_free;
}
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 13ddc4bd24ee..eadcf8dfd295 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -415,7 +415,7 @@ const char *btf__name_by_offset(const struct btf *btf, 
__u32 offset)
return NULL;
 }
 
-int btf_get_from_id(__u32 id, struct btf **btf)
+int btf__get_from_id(__u32 id, struct btf **btf)
 {
struct bpf_btf_info btf_info = { 0 };
__u32 len = sizeof(btf_info);
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 701ad2b6c41f..5336b2f37293 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -73,7 +73,7 @@ LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, 
__u32 type_id);
 LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__fd(const struct btf *btf);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 
offset);
-LIBBPF_API int btf_get_from_id(__u32 id, struct btf **btf);
+LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf);
 
 struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 void btf_ext__free(struct btf_ext *btf_ext);
diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index bcbda7037840..dff755a7940b 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -2604,7 +2604,7 @@ static int do_test_file(unsigned int test_num)
goto done;
}
 
-   err = btf_get_from_id(info.btf_id, );
+   err = btf__get_from_id(info.btf_id, );
if (CHECK(err, "cannot get btf from kernel, err: %d", err))
goto done;
 
-- 
2.17.1

Re: [PATCH bpf-next 1/2] libbpf: Add version script for DSO

2018-11-23 Thread Andrey Ignatov

Daniel Borkmann  [Thu, 2018-11-22 02:28 -0800]:
> On 11/21/2018 11:22 PM, Alexei Starovoitov wrote:
> > On 11/21/18 12:18 PM, Yonghong Song wrote:
> >> On 11/21/18 9:40 AM, Andrey Ignatov wrote:
> >>> More and more projects use libbpf and one day it'll likely be packaged
> >>> and distributed as DSO and that requires ABI versioning so that both
> >>> compatible and incompatible changes to ABI can be introduced in a safe
> >>> way in the future without breaking executables dynamically linked with a
> >>> previous version of the library.
> >>>
> >>> Usual way to do ABI versioning is version script for the linker. Add
> >>> such a script for libbpf. All global symbols currently exported via
> >>> LIBBPF_API macro are added to the version script libbpf.map.
> >>>
> >>> The version name LIBBPF_0.0.1 is constructed from the name of the
> >>> library + version specified by $(LIBBPF_VERSION) in Makefile.
> >>>
> >>> Version script does not duplicate the work done by LIBBPF_API macro, it
> >>> rather complements it. The macro is used at compile time and can be used
> >>> by compiler to do optimization that can't be done at link time, it is
> >>> purely about global symbol visibility. The version script, in turn, is
> >>> used at link time and takes care of ABI versioning. Both techniques are
> >>> described in detail in [1].
> >>>
> >>> Whenever ABI is changed in the future, version script should be changed
> >>> appropriately.
> >>
> >> Maybe we should clarify the policy of how version numbers should be
> >> changed? Each commit which changes default global symbol ABI? Each kernel
> >> release?
> 
> +1, could you add a documentation file into tools/lib/bpf/ where we
> keep note on this process?

That makes sense. I'll add documentation.

I think it'll take time to figure out a policy to maintain ABI that
works well (like when to bump version, etc). I'll describe what is
reasonable from my point of view so that we have a starting point and we
can refine / adjust it to reality later.

> >>> [1] 
> >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.akkadia.org_drepper_dsohowto.pdf=DwICaQ=5VD0RTtNlTh3ycd41b3MUw=3jAokpHyGuCuJ834j-tttQ=DaYaGCQXLC7Lqf82VhtHjSPrf6R4RdDMKrDDR2T9XPA=nN4Sz6re4n-pP50ICk8s0M-nu_535bblSiVPeEdGiFk=
> >>>
> >>> Signed-off-by: Andrey Ignatov 
> >>> ---
> >>>tools/lib/bpf/Makefile   |   4 +-
> >>>tools/lib/bpf/libbpf.map | 120 +++
> >>>2 files changed, 123 insertions(+), 1 deletion(-)
> >>>create mode 100644 tools/lib/bpf/libbpf.map
> >>>
> >>> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
> >>> index 425b480bda75..d76c41fa2d39 100644
> >>> --- a/tools/lib/bpf/Makefile
> >>> +++ b/tools/lib/bpf/Makefile
> >>> @@ -145,6 +145,7 @@ include $(srctree)/tools/build/Makefile.include
> >>>
> >>>BPF_IN:= $(OUTPUT)libbpf-in.o
> >>>LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
> >>> +VERSION_SCRIPT := libbpf.map
> >>>
> >>>CMD_TARGETS = $(LIB_FILE)
> >>>
> >>> @@ -170,7 +171,8 @@ $(BPF_IN): force elfdep bpfdep
> >>>   $(Q)$(MAKE) $(build)=libbpf
> >>>
> >>>$(OUTPUT)libbpf.so: $(BPF_IN)
> >>> - $(QUIET_LINK)$(CC) --shared $^ -o $@
> >>> + $(QUIET_LINK)$(CC) --shared -Wl,--version-script=$(VERSION_SCRIPT) \
> >>> + $^ -o $@
> >>>
> >>>$(OUTPUT)libbpf.a: $(BPF_IN)
> >>>   $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
> >>> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> >>> new file mode 100644
> >>> index ..9fe416b68c7d
> >>> --- /dev/null
> >>> +++ b/tools/lib/bpf/libbpf.map
> >>> @@ -0,0 +1,120 @@
> >>> +LIBBPF_0.0.1 {
> >>> + global:
> >>> + bpf_btf_get_fd_by_id;
> >>
> >> Do you think we could use this opportunities to
> >> make naming more consistent? For example,
> >> bpf_btf_get_fd_by_id => btf__get_fd_by_id?
> > 
> > I think this one is fine since it matches
> > bpf_[map|prog]_get_fd_by_id()
> > and it's a wrapper.
> > 
> >>> + bpf_create_map;
> >>> + bpf_create_map_in_map;
> >>> + bpf_create_map_in_map_node;
> >>> + bpf_create_map_name;
> >>> + bpf_create_map_node;
> >>> + bpf_create_map_xattr;
> >>> + bpf_load_btf;
> >>> + bpf_load_program;
> >>> + bpf_load_program_xattr;
> >>> + bpf_map__btf_key_type_id;
> >>> + bpf_map__btf_value_type_id;
> >>> + bpf_map__def;
> >>> + bpf_map_delete_elem; > +bpf_map__fd;
> >>> + bpf_map_get_fd_by_id;
> >>> + bpf_map_get_next_id;
> >>> + bpf_map_get_next_key; > +   
> >>> bpf_map__is_offload_neutral;
> >>> + bpf_map_lookup_and_delete_elem;
> >>> + bpf_map_lookup_elem;
> >>> + bpf_map__name;
> >>> + bpf_map__next;
> >>> + bpf_map__pin;
> >>> + bpf_map__prev;
> >>> + bpf_map__priv;
> >>> + bpf_map__reuse_fd;
> >>> + bpf_map__set_ifindex;
> >>> +

[PATCH bpf-next] bpf: align map type names formatting.

2018-11-23 Thread David Calavera


Make the formatting for map_type_name array consistent.

Signed-off-by: David Calavera 
---
tools/bpf/bpftool/map.c | 46 -
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index b0ebbed7d1a6..cbd3080e72c7 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -52,30 +52,30 @@
#include "main.h"

static const char * const map_type_name[] = {
-   [BPF_MAP_TYPE_UNSPEC]   = "unspec",
-   [BPF_MAP_TYPE_HASH] = "hash",
-   [BPF_MAP_TYPE_ARRAY]= "array",
-   [BPF_MAP_TYPE_PROG_ARRAY]   = "prog_array",
-   [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array",
-   [BPF_MAP_TYPE_PERCPU_HASH]  = "percpu_hash",
-   [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array",
-   [BPF_MAP_TYPE_STACK_TRACE]  = "stack_trace",
-   [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array",
-   [BPF_MAP_TYPE_LRU_HASH] = "lru_hash",
-   [BPF_MAP_TYPE_LRU_PERCPU_HASH]  = "lru_percpu_hash",
-   [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie",
-   [BPF_MAP_TYPE_ARRAY_OF_MAPS]= "array_of_maps",
-   [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps",
-   [BPF_MAP_TYPE_DEVMAP]   = "devmap",
-   [BPF_MAP_TYPE_SOCKMAP]  = "sockmap",
-   [BPF_MAP_TYPE_CPUMAP]   = "cpumap",
-   [BPF_MAP_TYPE_XSKMAP]   = "xskmap",
-   [BPF_MAP_TYPE_SOCKHASH] = "sockhash",
-   [BPF_MAP_TYPE_CGROUP_STORAGE]   = "cgroup_storage",
-   [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
+   [BPF_MAP_TYPE_UNSPEC]   = "unspec",
+   [BPF_MAP_TYPE_HASH] = "hash",
+   [BPF_MAP_TYPE_ARRAY]= "array",
+   [BPF_MAP_TYPE_PROG_ARRAY]   = "prog_array",
+   [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array",
+   [BPF_MAP_TYPE_PERCPU_HASH]  = "percpu_hash",
+   [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array",
+   [BPF_MAP_TYPE_STACK_TRACE]  = "stack_trace",
+   [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array",
+   [BPF_MAP_TYPE_LRU_HASH] = "lru_hash",
+   [BPF_MAP_TYPE_LRU_PERCPU_HASH]  = "lru_percpu_hash",
+   [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie",
+   [BPF_MAP_TYPE_ARRAY_OF_MAPS]= "array_of_maps",
+   [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps",
+   [BPF_MAP_TYPE_DEVMAP]   = "devmap",
+   [BPF_MAP_TYPE_SOCKMAP]  = "sockmap",
+   [BPF_MAP_TYPE_CPUMAP]   = "cpumap",
+   [BPF_MAP_TYPE_XSKMAP]   = "xskmap",
+   [BPF_MAP_TYPE_SOCKHASH] = "sockhash",
+   [BPF_MAP_TYPE_CGROUP_STORAGE]   = "cgroup_storage",
+   [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY]  = "reuseport_sockarray",
[BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE]= "percpu_cgroup_storage",
-   [BPF_MAP_TYPE_QUEUE] = "queue",
-   [BPF_MAP_TYPE_STACK] = "stack",
+   [BPF_MAP_TYPE_QUEUE]= "queue",
+   [BPF_MAP_TYPE_STACK]= "stack",
};

static bool map_is_per_cpu(__u32 type)
--
2.17.1

[PATCH] tags: Fix DEFINE_PER_CPU expansion

2018-11-23 Thread Rustam Kovhaev

Building tags produces warning:
 ctags: Warning: kernel/bpf/local_storage.c:10: null expansion of name pattern 
"\1"

Let's use the same fix as in commit <25528213fe9f75f4>, even though it
violates the usual code style.

Signed-off-by: Rustam Kovhaev 
---
 kernel/bpf/local_storage.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c97a8f968638..9e94b1cc6cf2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,8 +7,7 @@
 #include 
 #include 
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage*,
-  bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*, 
bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
-- 
2.19.1

[PATCH] net: gemini: Fix copy/paste error

2018-11-23 Thread Linus Walleij

From: Andreas Fiedler 

The TX stats should be started with the tx_stats_syncp,
there seems to be a copy/paste error in the driver.

Signed-off-by: Andreas Fiedler 
Signed-off-by: Linus Walleij 
---
 drivers/net/ethernet/cortina/gemini.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cortina/gemini.c 
b/drivers/net/ethernet/cortina/gemini.c
index ceec467f590d..949103db8a8a 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -660,7 +660,7 @@ static void gmac_clean_txq(struct net_device *netdev, 
struct gmac_txq *txq,
 
u64_stats_update_begin(>tx_stats_syncp);
port->tx_frag_stats[nfrags]++;
-   u64_stats_update_end(>ir_stats_syncp);
+   u64_stats_update_end(>tx_stats_syncp);
}
}
 
-- 
2.19.1

Re: [PATCH net] packet: copy user buffers before orphan or clone

2018-11-23 Thread Willem de Bruijn

On Fri, Nov 23, 2018 at 2:09 PM David Miller  wrote:
>
> From: Willem de Bruijn 
> Date: Tue, 20 Nov 2018 13:00:18 -0500
>
> > From: Willem de Bruijn 
> >
> > tpacket_snd sends packets with user pages linked into skb frags. It
> > notifies that pages can be reused when the skb is released by setting
> > skb->destructor to tpacket_destruct_skb.
> >
> > This can cause data corruption if the skb is orphaned (e.g., on
> > transmit through veth) or cloned (e.g., on mirror to another psock).
> >
> > Create a kernel-private copy of data in these cases, same as tun/tap
> > zerocopy transmission. Reuse that infrastructure: mark the skb as
> > SKBTX_ZEROCOPY_FRAG, which will trigger copy in skb_orphan_frags(_rx).
> >
> > Unlike other zerocopy packets, do not set shinfo destructor_arg to
> > struct ubuf_info. tpacket_destruct_skb already uses that ptr to notify
> > when the original skb is released and a timestamp is recorded. Do not
> > change this timestamp behavior. The ubuf_info->callback is not needed
> > anyway, as no zerocopy notification is expected.
> >
> > Mark destructor_arg as not-a-uarg by setting the lower bit to 1. The
> > resulting value is not a valid ubuf_info pointer, nor a valid
> > tpacket_snd frame address. Add skb_zcopy_.._nouarg helpers for this.
> >
> > The fix relies on features introduced in commit 52267790ef52 ("sock:
> > add MSG_ZEROCOPY"), so can be backported as is only to 4.14.
> >
> > Tested with `./in_netns.sh ./txring_overwrite` from
> > http://github.com/wdebruij/kerneltools/tests
> >
> > Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
> > Reported-by: Anand H. Krishnan 
> > Signed-off-by: Willem de Bruijn 
>
> Applied, and queued up for -stable.  Thanks for the backporting notes.
>
> Any chance those tests from your kerneltools repo can make their way
> into selftests?

Absolutely. I'll send it to net-next once it has the fix.

Re: [PATCH v3 3/4] libbpf: add bpf_prog_test_run_xattr

2018-11-23 Thread Daniel Borkmann

On 11/22/2018 03:09 PM, Lorenz Bauer wrote:
> Add a new function, which encourages safe usage of the test interface.
> bpf_prog_test_run continues to work as before, but should be considered
> unsafe.
> 
> Signed-off-by: Lorenz Bauer 

Set looks good to me, thanks! Three small things below:

> ---
>  tools/lib/bpf/bpf.c | 27 +++
>  tools/lib/bpf/bpf.h | 13 +
>  2 files changed, 40 insertions(+)
> 
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 961e1b9fc592..f8518bef6886 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -424,6 +424,33 @@ int bpf_prog_test_run(int prog_fd, int repeat, void 
> *data, __u32 size,
>   return ret;
>  }
>  
> +int bpf_prog_test_run_xattr(const struct bpf_prog_test_run_attr *test_attr,
> + __u32 *size_out, __u32 *retval, __u32 *duration)
> +{
> + union bpf_attr attr;
> + int ret;
> +
> + if (!test_attr->data_out && test_attr->size_out > 0)
> + return -EINVAL;
> +
> + bzero(, sizeof(attr));
> + attr.test.prog_fd = test_attr->prog_fd;
> + attr.test.data_in = ptr_to_u64(test_attr->data);
> + attr.test.data_out = ptr_to_u64(test_attr->data_out);
> + attr.test.data_size_in = test_attr->size;
> + attr.test.data_size_out = test_attr->size_out;
> + attr.test.repeat = test_attr->repeat;
> +
> + ret = sys_bpf(BPF_PROG_TEST_RUN, , sizeof(attr));
> + if (size_out)
> + *size_out = attr.test.data_size_out;
> + if (retval)
> + *retval = attr.test.retval;
> + if (duration)
> + *duration = attr.test.duration;
> + return ret;
> +}
> +
>  int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
>  {
>   union bpf_attr attr;
> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> index 26a51538213c..570f19f77f42 100644
> --- a/tools/lib/bpf/bpf.h
> +++ b/tools/lib/bpf/bpf.h
> @@ -110,6 +110,19 @@ LIBBPF_API int bpf_prog_attach(int prog_fd, int 
> attachable_fd,
>  LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
>  LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd,
>   enum bpf_attach_type type);
> +
> +struct bpf_prog_test_run_attr {
> + int prog_fd;
> + int repeat;
> + const void *data;
> + __u32 size;
> + void *data_out; /* optional */
> + __u32 size_out;

Small nit: could we name these data_{in,out} and data_size_{in,out} as
well, so it's analog to the ones from the bpf_attr?

> +};
> +
> +LIBBPF_API int bpf_prog_test_run_xattr(const struct bpf_prog_test_run_attr 
> *test_attr,
> +__u32 *size_out, __u32 *retval,
> +__u32 *duration);
>  LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
>__u32 size, void *data_out, __u32 *size_out,
>__u32 *retval, __u32 *duration);

Could we add a comment into the header here stating that we discourage
bpf_prog_test_run()'s use?

It would probably also make sense since we go that route that we would
convert the 10 bpf_prog_test_run() instances under test_progs.c at the
same time so that people extending or looking at BPF kselftests don't
copy discouraged bpf_prog_test_run() api as examples from this point
onwards anymore.

Thanks,
Daniel

Re: [PATCH v2 bpf-next 1/1] libbpf: make bpf_object__open default to UNSPEC

2018-11-23 Thread Daniel Borkmann

On 11/23/2018 09:58 PM, Nikita V. Shirokov wrote:
> currently by default libbpf's bpf_object__open requires
> bpf's program to specify  version in a code because of two things:
> 1) default prog type is set to KPROBE
> 2) KPROBE requires (in kernel/bpf/syscall.c) version to be specified
> 
> in this patch i'm changing default prog type to UNSPEC and also changing
> requirements for version's section to be present in object file.
> now it would reflect what we have today in kernel
> (only KPROBE prog type requires for version to be explicitly set).
> 
> v1 -> v2:
>  - RFC tag has been dropped
> 
> Signed-off-by: Nikita V. Shirokov 

Applied, thanks!

Re: [PATCH v2] samples: bpf: fix: error handling regarding kprobe_events

2018-11-23 Thread Daniel Borkmann

On 11/22/2018 11:14 PM, Daniel T. Lee wrote:
> Currently, kprobe_events failure won't be handled properly.
> Due to calling system() indirectly to write to kprobe_events,
> it can't be identified whether an error is derived from kprobe or system.
> 
> // buf = "echo '%c:%s %s' >> /s/k/d/t/kprobe_events"
> err = system(buf);
> if (err < 0) {
> printf("failed to create kprobe ..");
> return -1;
> }
> 
> For example, running ./tracex7 sample in ext4 partition,
> "echo p:open_ctree open_ctree >> /s/k/d/t/kprobe_events"
> gets 256 error code system() failure.
> => The error comes from kprobe, but it's not handled correctly.
> 
> According to man of system(3), its return value
> just passes the termination status of the child shell
> rather than treating the error as -1. (don't care success)
> 
> Which means, currently it's not working as desired.
> (According to the upper code snippet)
> 
> ex) running ./tracex7 with ext4 env.
> # Current Output
> sh: echo: I/O error
> failed to open event open_ctree
> 
> # Desired Output
> failed to create kprobe 'open_ctree' error 'No such file or directory'
> 
> The problem is, error can't be verified whether from child ps or system.
> 
> But using write() directly can verify the command failure,
> and it will treat all error as -1.
> 
> So I suggest using write() directly to 'kprobe_events'
> rather than calling system().
> 
> Signed-off-by: Daniel T. Lee 

Applied to bpf-next, thanks!

Re: [PATCH bpf-next] bpf: Add BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_QUEUE to bpftool-map

2018-11-23 Thread Daniel Borkmann

On 11/23/2018 06:48 PM, David Calavera wrote:
> Hi,
> 
> Sorry for the mistake, I'll send a new patch. Before doing that, I've
> noticed that the array of map names in tools/bpf/bpftool/map.c is very
> inconsistent in formatting, some lines use tabs to align the names, others
> use spaces, and other are not aligned at all. Is there any formatting
> convention for this? I can fix those lines if you have a preferred method
> now that I'm adding new elements to that array.

I've fixed the typo from the subject and applied your patch. If you want to
send a patch with white-space cleanup for all the entries that would be fine
with me, sure. You could align all the '=' with tabs to the one from
percpu_cgroup_storage.

Thanks,
Daniel

> On Fri, Nov 23, 2018 at 2:56 AM Edward Cree  wrote:
> 
>> On 22/11/18 20:59, David Calavera wrote:
>>> I noticed that these two new BPF Maps are not defined in bpftool.
>>> This patch defines those two maps and adds their names to the
>>> bpftool-map documentation.
>>>
>>> Signed-off-by: David Calavera 
>>> ---
>> Subject line says 'QUEUE' twice, should one of those be 'STACK'?
>>>  tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++-
>>>  tools/bpf/bpftool/map.c | 2 ++
>>>  2 files changed, 4 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst
>> b/tools/bpf/bpftool/Documentation/bpftool-map.rst
>>> index f55a2daed59b..9e827e342d9e 100644
>>> --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
>>> +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
>>> @@ -42,7 +42,8 @@ MAP COMMANDS
>>>  || **percpu_array** | **stack_trace** | **cgroup_array** |
>> **lru_hash**
>>>  || **lru_percpu_hash** | **lpm_trie** | **array_of_maps** |
>> **hash_of_maps**
>>>  || **devmap** | **sockmap** | **cpumap** | **xskmap** |
>> **sockhash**
>>> -|| **cgroup_storage** | **reuseport_sockarray** |
>> **percpu_cgroup_storage** }
>>> +|| **cgroup_storage** | **reuseport_sockarray** |
>> **percpu_cgroup_storage**
>>> +|| **queue** | **stack** }
>>>
>>>  DESCRIPTION
>>>  ===
>>> diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
>>> index 7bf38f0e152e..68b656b6edcc 100644
>>> --- a/tools/bpf/bpftool/map.c
>>> +++ b/tools/bpf/bpftool/map.c
>>> @@ -74,6 +74,8 @@ static const char * const map_type_name[] = {
>>>   [BPF_MAP_TYPE_CGROUP_STORAGE]   = "cgroup_storage",
>>>   [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
>>>   [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE]= "percpu_cgroup_storage",
>>> + [BPF_MAP_TYPE_QUEUE] = "queue",
>>> + [BPF_MAP_TYPE_STACK] = "stack",
>>>  };
>>>
>>>  static bool map_is_per_cpu(__u32 type)
>>
>>
>>
>

[PATCH v2 bpf-next 1/1] libbpf: make bpf_object__open default to UNSPEC

2018-11-23 Thread Nikita V. Shirokov

currently by default libbpf's bpf_object__open requires
bpf's program to specify  version in a code because of two things:
1) default prog type is set to KPROBE
2) KPROBE requires (in kernel/bpf/syscall.c) version to be specified

in this patch i'm changing default prog type to UNSPEC and also changing
requirements for version's section to be present in object file.
now it would reflect what we have today in kernel
(only KPROBE prog type requires for version to be explicitly set).

v1 -> v2:
 - RFC tag has been dropped

Signed-off-by: Nikita V. Shirokov 
---
 tools/lib/bpf/libbpf.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0f14f7c074c2..ed4212a4c5f9 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -333,7 +333,7 @@ bpf_program__init(void *data, size_t size, char 
*section_name, int idx,
prog->idx = idx;
prog->instances.fds = NULL;
prog->instances.nr = -1;
-   prog->type = BPF_PROG_TYPE_KPROBE;
+   prog->type = BPF_PROG_TYPE_UNSPEC;
prog->btf_fd = -1;
 
return 0;
@@ -1649,12 +1649,12 @@ static bool bpf_prog_type__needs_kver(enum 
bpf_prog_type type)
case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
-   return false;
case BPF_PROG_TYPE_UNSPEC:
-   case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
-   case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+   case BPF_PROG_TYPE_PERF_EVENT:
+   return false;
+   case BPF_PROG_TYPE_KPROBE:
default:
return true;
}
-- 
2.15.1

Re: [PATCH v2] bpf: fix check of allowed specifiers in bpf_trace_printk

2018-11-23 Thread Daniel Borkmann

On 11/23/2018 05:43 PM, Martynas Pumputis wrote:
> A format string consisting of "%p" or "%s" followed by an invalid
> specifier (e.g. "%p%\n" or "%s%") could pass the check which
> would make format_decode (lib/vsprintf.c) to warn.
> 
> Reported-by: syzbot+1ec5c5ec949c4adaa...@syzkaller.appspotmail.com
> Signed-off-by: Martynas Pumputis 

Applied to bpf, thanks!

Re: [PATCH bpf-next 1/2] libbpf: Add version script for DSO

2018-11-23 Thread Andrey Ignatov

Martin Lau  [Fri, 2018-11-23 10:44 -0800]:
> On Wed, Nov 21, 2018 at 02:22:14PM -0800, Alexei Starovoitov wrote:
> > On 11/21/18 12:18 PM, Yonghong Song wrote:
> > > 
> > > 
> > > On 11/21/18 9:40 AM, Andrey Ignatov wrote:
> > >> More and more projects use libbpf and one day it'll likely be packaged
> > >> and distributed as DSO and that requires ABI versioning so that both
> > >> compatible and incompatible changes to ABI can be introduced in a safe
> > >> way in the future without breaking executables dynamically linked with a
> > >> previous version of the library.
> > >>
> > >> Usual way to do ABI versioning is version script for the linker. Add
> > >> such a script for libbpf. All global symbols currently exported via
> > >> LIBBPF_API macro are added to the version script libbpf.map.
> > >>
> > >> The version name LIBBPF_0.0.1 is constructed from the name of the
> > >> library + version specified by $(LIBBPF_VERSION) in Makefile.
> > >>
> > >> Version script does not duplicate the work done by LIBBPF_API macro, it
> > >> rather complements it. The macro is used at compile time and can be used
> > >> by compiler to do optimization that can't be done at link time, it is
> > >> purely about global symbol visibility. The version script, in turn, is
> > >> used at link time and takes care of ABI versioning. Both techniques are
> > >> described in details in [1].
> > >>
> > >> Whenever ABI is changed in the future, version script should be changed
> > >> appropriately.
> > > 
> > > Maybe we should clarify the policy of how version numbers should be
> > > > changed? Each commit which changes default global symbol ABI? Each kernel
> > > release?
> > > 
> > >>
> > >> [1] https://www.akkadia.org/drepper/dsohowto.pdf
> > >>
> > >> Signed-off-by: Andrey Ignatov 
> > >> ---
> > >>tools/lib/bpf/Makefile   |   4 +-
> > >>tools/lib/bpf/libbpf.map | 120 +++
> > >>2 files changed, 123 insertions(+), 1 deletion(-)
> > >>create mode 100644 tools/lib/bpf/libbpf.map
> > >>
> > >> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
> > >> index 425b480bda75..d76c41fa2d39 100644
> > >> --- a/tools/lib/bpf/Makefile
> > >> +++ b/tools/lib/bpf/Makefile
> > >> @@ -145,6 +145,7 @@ include $(srctree)/tools/build/Makefile.include
> > >>
> > >>BPF_IN:= $(OUTPUT)libbpf-in.o
> > >>LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
> > >> +VERSION_SCRIPT := libbpf.map
> > >>
> > >>CMD_TARGETS = $(LIB_FILE)
> > >>
> > >> @@ -170,7 +171,8 @@ $(BPF_IN): force elfdep bpfdep
> > >>  $(Q)$(MAKE) $(build)=libbpf
> > >>
> > >>$(OUTPUT)libbpf.so: $(BPF_IN)
> > >> -$(QUIET_LINK)$(CC) --shared $^ -o $@
> > >> +$(QUIET_LINK)$(CC) --shared 
> > >> -Wl,--version-script=$(VERSION_SCRIPT) \
> > >> +$^ -o $@
> > >>
> > >>$(OUTPUT)libbpf.a: $(BPF_IN)
> > >>  $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
> > >> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> > >> new file mode 100644
> > >> index ..9fe416b68c7d
> > >> --- /dev/null
> > >> +++ b/tools/lib/bpf/libbpf.map
> > >> @@ -0,0 +1,120 @@
> > >> +LIBBPF_0.0.1 {
> > >> +global:
> > >> +bpf_btf_get_fd_by_id;
> > > 
> > > Do you think we could use this opportunities to
> > > make naming more consistent? For example,
> > > bpf_btf_get_fd_by_id => btf__get_fd_by_id?
> > 
> > I think this one is fine since it matches
> > bpf_[map|prog]_get_fd_by_id()
> > and it's a wrapper.
> Agree with keeping btf's get_fd_by_id() name to match with
> other get_fd_by_id() interfaces.
> 
> > 
> > >> +bpf_create_map;
> > >> +bpf_create_map_in_map;
> > >> +bpf_create_map_in_map_node;
> > >> +bpf_create_map_name;
> > >> +bpf_create_map_node;
> > >> +bpf_create_map_xattr;
> > >> +bpf_load_btf;
> > >> +bpf_load_program;
> > >> +bpf_load_program_xattr;
> > >> +bpf_map__btf_key_type_id;
> > >> +bpf_map__btf_value_type_id;
> > >> +bpf_map__def;
> > >> +bpf_map_delete_elem; > +bpf_map__fd;
> > >> +bpf_map_get_fd_by_id;
> > >> +bpf_map_get_next_id;
> > >> +bpf_map_get_next_key; > +   
> > >> bpf_map__is_offload_neutral;
> > >> +bpf_map_lookup_and_delete_elem;
> > >> +bpf_map_lookup_elem;
> > >> +bpf_map__name;
> > >> +bpf_map__next;
> > >> +bpf_map__pin;
> > >> +bpf_map__prev;
> > >> +bpf_map__priv;
> > >> +bpf_map__reuse_fd;
> > >> +bpf_map__set_ifindex;
> > >> +bpf_map__set_priv;
> > >> +bpf_map__unpin;
> > >> +bpf_map_update_elem;
> > >> +

Re: [net-next 0/6][pull request] Intel Wired LAN Driver Updates 2018-11-21

2018-11-23 Thread David Miller

From: Jeff Kirsher 
Date: Wed, 21 Nov 2018 11:54:17 -0800

> This series contains updates to all of the Intel LAN drivers and
> documentation.
> 
> Shannon Nelson updates the ixgbe kernel documentation to include IPsec
> hardware offload.
> 
> Joe Perches cleans up whitespace issues in the igb driver.
> 
> Jesse updates the netdev kernel documentation for NETIF_F_GSO_UDP_L4 to
> align with the actual code.  Also aligned all the NAPI driver code for
> all of the Intel drivers to implement the recommendations of Eric
> Dumazet to check the return code of the napi_complete_done() to
> determine whether or not to enable interrupts or exit poll.
> 
> Paul E. McKenney replaces synchronize_sched() with synchronize_rcu() for
> ixgbe.
> 
> Sasha implements suggestions made by Joe Perches to remove obsolete code
> and to use the dev_err() method.

Pulled, thanks Jeff.

Re: [PATCH net-next] net-gro: use ffs() to speedup napi_gro_flush()

2018-11-23 Thread David Miller

From: Eric Dumazet 
Date: Wed, 21 Nov 2018 11:39:28 -0800

> We very often have few flows/chains to look at, and we
> might increase GRO_HASH_BUCKETS to 32 or 64 in the future.
> 
> Signed-off-by: Eric Dumazet 

Applied.

Re: [PATCH net-next 0/3] tcp: take a bit more care of backlog stress

2018-11-23 Thread Eric Dumazet

On Fri, Nov 23, 2018 at 11:25 AM David Miller  wrote:
> My impression is that patch #2 needs some fixes in order to not
> lose dupacks.  So there will be a respin of this.
>
> Thanks.

You are absolutely right, we will submit a v2 next week after TG holidays.

Thanks.

Re: [PATCH net-next 0/3] tcp: take a bit more care of backlog stress

2018-11-23 Thread David Miller

From: Eric Dumazet 
Date: Wed, 21 Nov 2018 09:52:37 -0800

> While working on the SACK compression issue Jean-Louis Dupond
> reported, we found that his linux box was suffering very hard
> from tail drops on the socket backlog queue, because the opposite
> TCP stack was not implementing latest RFC recommendations.
> 
> First patch is a cleanup
> 
> Second patch is attempting coalescing when a new packet must
> be added to the backlog queue. Cooking bigger skbs helps
> to keep backlog list smaller and speeds its handling when
> user thread finally releases the socket lock.
> 
> Third patch is implementing head drop as a last resort.
> Head drops are generally better for optimal TCP behavior.

My impression is that patch #2 needs some fixes in order to not
lose dupacks.  So there will be a respin of this.

Thanks.

Re: [PATCH net] net/sched: act_police: add missing spinlock initialization

2018-11-23 Thread David Miller

From: Davide Caratti 
Date: Wed, 21 Nov 2018 18:23:53 +0100

> commit f2cbd4852820 ("net/sched: act_police: fix race condition on state
> variables") introduces a new spinlock, but forgets its initialization.
> Ensure that tcf_police_init() initializes 'tcfp_lock' every time a 'police'
> action is newly created, to avoid the following lockdep splat:
 ...
> Fixes: f2cbd4852820 ("net/sched: act_police: fix race condition on state 
> variables")
> Reported-by: Cong Wang 
> Signed-off-by: Davide Caratti 

Applied.

Re: [PATCH net v2] net: don't keep lonely packets forever in the gro hash

2018-11-23 Thread David Miller

From: Paolo Abeni 
Date: Wed, 21 Nov 2018 18:21:35 +0100

> Eric noted that with UDP GRO and NAPI timeout, we could keep a single
> UDP packet inside the GRO hash forever, if the related NAPI instance
> calls napi_gro_complete() at a higher frequency than the NAPI timeout.
> Willem noted that even TCP packets could be trapped there, till the
> next retransmission.
> This patch tries to address the issue, flushing the old packets -
> those with a NAPI_GRO_CB age before the current jiffy - before scheduling
> the NAPI timeout. The rationale is that such a timeout should be
> well below a jiffy and we are not flushing packets eligible for sane GRO.
> 
> v1  -> v2:
>  - clarified the commit message and comment
> 
> RFC -> v1:
>  - added 'Fixes tags', cleaned-up the wording.
> 
> Reported-by: Eric Dumazet 
> Fixes: 3b47d30396ba ("net: gro: add a per device gro flush timer")
> Signed-off-by: Paolo Abeni 

Applied and queued up for -stable.

Re: [PATCH net] net/ipv6: re-do dad when interface has IFF_NOARP flag change

2018-11-23 Thread David Miller

From: Hangbin Liu 
Date: Wed, 21 Nov 2018 21:52:33 +0800

> When we add a new IPv6 address, we should also join corresponding 
> solicited-node
> multicast address, unless the interface has IFF_NOARP flag, as function
> addrconf_join_solict() did. But if we remove IFF_NOARP flag later, we do
> not do dad and add the mcast address. So we will drop corresponding neighbour
> discovery message that came from other nodes.
> 
> A typical example is after creating a ipvlan with mode l3, setting up an ipv6
> address and changing the mode to l2. Then we will not be able to ping this
> address as the interface doesn't join related solicited-node mcast address.
> 
> Fix it by re-doing dad when interface changed IFF_NOARP flag. Then we will add
> corresponding mcast group and check if there is a duplicate address on the
> network.
> 
> Reported-by: Jianlin Shi 
> Reviewed-by: Stefano Brivio 
> Signed-off-by: Hangbin Liu 

Applied.

Re: [PATCH net] packet: copy user buffers before orphan or clone

2018-11-23 Thread David Miller

From: Willem de Bruijn 
Date: Tue, 20 Nov 2018 13:00:18 -0500

> From: Willem de Bruijn 
> 
> tpacket_snd sends packets with user pages linked into skb frags. It
> notifies that pages can be reused when the skb is released by setting
> skb->destructor to tpacket_destruct_skb.
> 
> This can cause data corruption if the skb is orphaned (e.g., on
> transmit through veth) or cloned (e.g., on mirror to another psock).
> 
> Create a kernel-private copy of data in these cases, same as tun/tap
> zerocopy transmission. Reuse that infrastructure: mark the skb as
> SKBTX_ZEROCOPY_FRAG, which will trigger copy in skb_orphan_frags(_rx).
> 
> Unlike other zerocopy packets, do not set shinfo destructor_arg to
> struct ubuf_info. tpacket_destruct_skb already uses that ptr to notify
> when the original skb is released and a timestamp is recorded. Do not
> change this timestamp behavior. The ubuf_info->callback is not needed
> anyway, as no zerocopy notification is expected.
> 
> Mark destructor_arg as not-a-uarg by setting the lower bit to 1. The
> resulting value is not a valid ubuf_info pointer, nor a valid
> tpacket_snd frame address. Add skb_zcopy_.._nouarg helpers for this.
> 
> The fix relies on features introduced in commit 52267790ef52 ("sock:
> add MSG_ZEROCOPY"), so can be backported as is only to 4.14.
> 
> Tested with `./in_netns.sh ./txring_overwrite` from
> http://github.com/wdebruij/kerneltools/tests
> 
> Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
> Reported-by: Anand H. Krishnan 
> Signed-off-by: Willem de Bruijn 

Applied, and queued up for -stable.  Thanks for the backporting notes.

Any chance those tests from your kerneltools repo can make their way
into selftests?

Thanks.

Re: [PATCH v1 net-next] ip6_tunnel: Adding support of mapping rules for MAP-E tunnel

2018-11-23 Thread David Miller

From: Felix Jia 
Date: Tue, 20 Nov 2018 14:53:24 +1300

> +struct ip6_tnl_rule {
> + u8 version;
> + struct in6_addr ipv6_subnet;
> + u8 ipv6_prefixlen;
> + struct in_addr ipv4_subnet;
> + u8 ipv4_prefixlen;
> + u8 ea_length;
> + u8 psid_offset;

Please arrange the members of this structure better so that there is no
internal padding.  Putting a u8 before an in6_addr puts at least 3 bytes
of wasted padding after the u8, for example.

> + u8 *ptr;
> + struct iphdr *icmpiph = NULL;
> + struct tcphdr *tcph, *icmptcph;
> + struct udphdr *udph, *icmpudph;
> + struct icmphdr *icmph, *icmpicmph;

Please arrange all local variables from longest to shortest line, ie. reverse
christmas tree format.

> + int i, pbw0, pbi0, pbi1;
> + __u32 addr[4];
> + __u32 psid = 0;
> + __u32 mask = 0;
> + __u32 a = ntohl(addr4);
> + __u16 p = ntohs(port4);
> + int psid_prefix_length = 0;
> + int psid_mask;
> + __u32 id0 = 0;
> + __u32 id1 = 0;

Likewise.

Also, many of these explicit "= 0" initializations are unnecessary and
make the declarations more ugly than they need to be.

> +static void
> +ip6_tnl_mape_dst(struct net_device *dev, struct sk_buff *skb,
> +  struct flowi6 *fl6)
> +{
> + struct ip6_tnl *t = netdev_priv(dev);
> + struct iphdr *iph;
> + __be32 saddr4, daddr4, addr;
> + __be16 sport4, dport4, port;
> + __u8 proto;
> + int icmperr;
> + struct ip6_tnl_rule *mr = NULL;

Reverse christmas tree please.

> +static struct ip6_tnl __rcu **
> +ip6_tnl_bucket_r_any(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm 
> *p)
> +{
> + const struct in6_addr *local = &p->laddr;
> + unsigned int h = 0;
> + int prio = 0;
> + struct in6_addr any;

Likewise.

And so on and so forth for your entire patch.

Re: [PATCH bpf-next 1/2] libbpf: Add version script for DSO

2018-11-23 Thread Martin Lau

On Wed, Nov 21, 2018 at 02:22:14PM -0800, Alexei Starovoitov wrote:
> On 11/21/18 12:18 PM, Yonghong Song wrote:
> > 
> > 
> > On 11/21/18 9:40 AM, Andrey Ignatov wrote:
> >> More and more projects use libbpf and one day it'll likely be packaged
> >> and distributed as DSO and that requires ABI versioning so that both
> >> compatible and incompatible changes to ABI can be introduced in a safe
> >> way in the future without breaking executables dynamically linked with a
> >> previous version of the library.
> >>
> >> Usual way to do ABI versioning is version script for the linker. Add
> >> such a script for libbpf. All global symbols currently exported via
> >> LIBBPF_API macro are added to the version script libbpf.map.
> >>
> >> The version name LIBBPF_0.0.1 is constructed from the name of the
> >> library + version specified by $(LIBBPF_VERSION) in Makefile.
> >>
> >> Version script does not duplicate the work done by LIBBPF_API macro, it
> >> rather complements it. The macro is used at compile time and can be used
> >> by compiler to do optimization that can't be done at link time, it is
> >> purely about global symbol visibility. The version script, in turn, is
> >> used at link time and takes care of ABI versioning. Both techniques are
> >> described in details in [1].
> >>
> >> Whenever ABI is changed in the future, version script should be changed
> >> appropriately.
> > 
> > Maybe we should clarify the policy of how version numbers should be
> > > changed? Each commit which changes default global symbol ABI? Each kernel
> > release?
> > 
> >>
> >> [1] https://www.akkadia.org/drepper/dsohowto.pdf
> >>
> >> Signed-off-by: Andrey Ignatov 
> >> ---
> >>tools/lib/bpf/Makefile   |   4 +-
> >>tools/lib/bpf/libbpf.map | 120 +++
> >>2 files changed, 123 insertions(+), 1 deletion(-)
> >>create mode 100644 tools/lib/bpf/libbpf.map
> >>
> >> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
> >> index 425b480bda75..d76c41fa2d39 100644
> >> --- a/tools/lib/bpf/Makefile
> >> +++ b/tools/lib/bpf/Makefile
> >> @@ -145,6 +145,7 @@ include $(srctree)/tools/build/Makefile.include
> >>
> >>BPF_IN:= $(OUTPUT)libbpf-in.o
> >>LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
> >> +VERSION_SCRIPT := libbpf.map
> >>
> >>CMD_TARGETS = $(LIB_FILE)
> >>
> >> @@ -170,7 +171,8 @@ $(BPF_IN): force elfdep bpfdep
> >>$(Q)$(MAKE) $(build)=libbpf
> >>
> >>$(OUTPUT)libbpf.so: $(BPF_IN)
> >> -  $(QUIET_LINK)$(CC) --shared $^ -o $@
> >> +  $(QUIET_LINK)$(CC) --shared -Wl,--version-script=$(VERSION_SCRIPT) \
> >> +  $^ -o $@
> >>
> >>$(OUTPUT)libbpf.a: $(BPF_IN)
> >>$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
> >> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> >> new file mode 100644
> >> index ..9fe416b68c7d
> >> --- /dev/null
> >> +++ b/tools/lib/bpf/libbpf.map
> >> @@ -0,0 +1,120 @@
> >> +LIBBPF_0.0.1 {
> >> +  global:
> >> +  bpf_btf_get_fd_by_id;
> > 
> > Do you think we could use this opportunities to
> > make naming more consistent? For example,
> > bpf_btf_get_fd_by_id => btf__get_fd_by_id?
> 
> I think this one is fine since it matches
> bpf_[map|prog]_get_fd_by_id()
> and it's a wrapper.
Agree with keeping btf's get_fd_by_id() name to match with
other get_fd_by_id() interfaces.

> 
> >> +  bpf_create_map;
> >> +  bpf_create_map_in_map;
> >> +  bpf_create_map_in_map_node;
> >> +  bpf_create_map_name;
> >> +  bpf_create_map_node;
> >> +  bpf_create_map_xattr;
> >> +  bpf_load_btf;
> >> +  bpf_load_program;
> >> +  bpf_load_program_xattr;
> >> +  bpf_map__btf_key_type_id;
> >> +  bpf_map__btf_value_type_id;
> >> +  bpf_map__def;
> >> +  bpf_map_delete_elem; > +bpf_map__fd;
> >> +  bpf_map_get_fd_by_id;
> >> +  bpf_map_get_next_id;
> >> +  bpf_map_get_next_key; > +   
> >> bpf_map__is_offload_neutral;
> >> +  bpf_map_lookup_and_delete_elem;
> >> +  bpf_map_lookup_elem;
> >> +  bpf_map__name;
> >> +  bpf_map__next;
> >> +  bpf_map__pin;
> >> +  bpf_map__prev;
> >> +  bpf_map__priv;
> >> +  bpf_map__reuse_fd;
> >> +  bpf_map__set_ifindex;
> >> +  bpf_map__set_priv;
> >> +  bpf_map__unpin;
> >> +  bpf_map_update_elem;
> >> +  bpf_object__btf_fd;
> >> +  bpf_object__close;
> >> +  bpf_object__find_map_by_name;
> >> +  bpf_object__find_map_by_offset;
> >> +  bpf_object__find_program_by_title;
> >> +  bpf_object__kversion;
> >> +  bpf_object__load;
> >> +  bpf_object__name;
> >> +  bpf_object__next;
> >> +  bpf_object__open;
> >> +  bpf_object__open_buffer;
> >> +  bpf_object__open_xattr;
> >> +  bpf_object__pin;
> >> +

[PATCH net] net: phy: add workaround for issue where PHY driver doesn't bind to the device

2018-11-23 Thread Heiner Kallweit

After switching the r8169 driver to use phylib some users reported that
their network is broken. This was caused by the genphy PHY driver being
used instead of the dedicated PHY driver for the RTL8211B. Users
reported that loading the Realtek PHY driver module upfront fixes the
issue. See also this mail thread:
https://marc.info/?t=154279781800003&r=1&w=2
The issue is quite weird and the root cause seems to be somewhere in
the base driver core. The patch works around the issue and may be
removed once the actual issue is fixed.

The Fixes tag refers to the first reported occurrence of the issue.
The issue itself may have been existing much longer and it may affect
users of other network chips as well. Users typically will recognize
this issue only if their PHY stops working when being used with the
genphy driver.

Fixes: f1e911d5d0df ("r8169: add basic phylib support")
Signed-off-by: Heiner Kallweit 
---
I'm not sure how long it will take to find and fix the root cause of
the issue. With this workaround affected users have a working network
again from 4.19.5 at least.
---
 drivers/net/phy/phy_device.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e06613f2d..0904002b1 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2255,6 +2255,14 @@ int phy_driver_register(struct phy_driver *new_driver, 
struct module *owner)
new_driver->mdiodrv.driver.remove = phy_remove;
new_driver->mdiodrv.driver.owner = owner;
 
+   /* The following works around an issue where the PHY driver doesn't bind
+* to the device, resulting in the genphy driver being used instead of
+* the dedicated driver. The root cause of the issue isn't known yet
+* and seems to be in the base driver core. Once this is fixed we may
+* remove this workaround.
+*/
+   new_driver->mdiodrv.driver.probe_type = PROBE_FORCE_SYNCHRONOUS;
+
retval = driver_register(&new_driver->mdiodrv.driver);
if (retval) {
pr_err("%s: Error %d in registering driver\n",
-- 
2.19.1

Re: [PATCH] dt-bindings: dsa: Fix typo in "probed"

2018-11-23 Thread Andrew Lunn

On Fri, Nov 23, 2018 at 03:46:50PM -0200, Fabio Estevam wrote:
> The correct form is "can be probed", so fix the typo.
> 
> Signed-off-by: Fabio Estevam 

Reviewed-by: Andrew Lunn 

Andrew

[PATCH] dt-bindings: dsa: Fix typo in "probed"

2018-11-23 Thread Fabio Estevam

The correct form is "can be probed", so fix the typo.

Signed-off-by: Fabio Estevam 
---
 Documentation/devicetree/bindings/net/dsa/dsa.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt 
b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index 3ceeb8d..35694c0 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.txt
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt
@@ -7,7 +7,7 @@ limitations.
 Current Binding
 ---
 
-Switches are true Linux devices and can be probes by any means. Once
+Switches are true Linux devices and can be probed by any means. Once
 probed, they register to the DSA framework, passing a node
 pointer. This node is expected to fulfil the following binding, and
 may contain additional properties as required by the device it is
-- 
2.7.4

[PATCH net] net: thunderx: set tso_hdrs pointer to NULL in nicvf_free_snd_queue

2018-11-23 Thread Lorenzo Bianconi

Reset snd_queue tso_hdrs pointer to NULL in nicvf_free_snd_queue routine
since it is used to check if tso dma descriptor queue has been previously
allocated. The issue can be triggered with the following reproducer:

$ip link set dev enP2p1s0v0 xdpdrv obj xdp_dummy.o
$ip link set dev enP2p1s0v0 xdpdrv off

[  341.467649] WARNING: CPU: 74 PID: 2158 at mm/vmalloc.c:1511 
__vunmap+0x98/0xe0
[  341.515010] Hardware name: GIGABYTE H270-T70/MT70-HD0, BIOS T49 02/02/2018
[  341.521874] pstate: 6045 (nZCv daif +PAN -UAO)
[  341.526654] pc : __vunmap+0x98/0xe0
[  341.530132] lr : __vunmap+0x98/0xe0
[  341.533609] sp : 1c5db860
[  341.536913] x29: 1c5db860 x28: 0002
[  341.542214] x27: 810feb5090b0 x26: 17e57000
[  341.547515] x25:  x24: fbd0
[  341.552816] x23:  x22: 810feb5090b0
[  341.558117] x21:  x20: 
[  341.563418] x19: 17e57000 x18: 
[  341.568719] x17:  x16: 
[  341.574020] x15: 0010 x14: 
[  341.579321] x13: 8985eb27 x12: 0985eb2f
[  341.584622] x11: 096b3000 x10: 1c5db510
[  341.589923] x9 : ffd0 x8 : 086868e8
[  341.595224] x7 : 3430303030303030 x6 : 06ef
[  341.600525] x5 : 003f x4 : 
[  341.605825] x3 :  x2 : 
[  341.611126] x1 : 096b3728 x0 : 0038
[  341.616428] Call trace:
[  341.618866]  __vunmap+0x98/0xe0
[  341.621997]  vunmap+0x3c/0x50
[  341.624961]  arch_dma_free+0x68/0xa0
[  341.628534]  dma_direct_free+0x50/0x80
[  341.632285]  nicvf_free_resources+0x160/0x2d8 [nicvf]
[  341.637327]  nicvf_config_data_transfer+0x174/0x5e8 [nicvf]
[  341.642890]  nicvf_stop+0x298/0x340 [nicvf]
[  341.647066]  __dev_close_many+0x9c/0x108
[  341.650977]  dev_close_many+0xa4/0x158
[  341.654720]  rollback_registered_many+0x140/0x530
[  341.659414]  rollback_registered+0x54/0x80
[  341.663499]  unregister_netdevice_queue+0x9c/0xe8
[  341.668192]  unregister_netdev+0x28/0x38
[  341.672106]  nicvf_remove+0xa4/0xa8 [nicvf]
[  341.676280]  nicvf_shutdown+0x20/0x30 [nicvf]
[  341.680630]  pci_device_shutdown+0x44/0x88
[  341.684720]  device_shutdown+0x144/0x250
[  341.688640]  kernel_restart_prepare+0x44/0x50
[  341.692986]  kernel_restart+0x20/0x68
[  341.696638]  __se_sys_reboot+0x210/0x238
[  341.700550]  __arm64_sys_reboot+0x24/0x30
[  341.704555]  el0_svc_handler+0x94/0x110
[  341.708382]  el0_svc+0x8/0xc
[  341.711252] ---[ end trace 3f4019c8439959c9 ]---
[  341.715874] page:7e0003ef4000 count:0 mapcount:0 
mapping: index:0x4
[  341.723872] flags: 0x1fffe0()
[  341.727527] raw: 001fffe0 7e0003f1a008 7e0003ef4048 

[  341.735263] raw: 0004   

[  341.742994] page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0)

where xdp_dummy.c is a simple bpf program that forwards the incoming
frames to the network stack (available here:
https://github.com/altoor/xdp_walkthrough_examples/blob/master/sample_1/xdp_dummy.c)

Fixes: 05c773f52b96 ("net: thunderx: Add basic XDP support")
Fixes: 4863dea3fab0 ("net: Adding support for Cavium ThunderX network
controller")

Signed-off-by: Lorenzo Bianconi 
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 187a249ff2d1..fcaf18fa3904 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -585,10 +585,12 @@ static void nicvf_free_snd_queue(struct nicvf *nic, 
struct snd_queue *sq)
if (!sq->dmem.base)
return;
 
-   if (sq->tso_hdrs)
+   if (sq->tso_hdrs) {
dma_free_coherent(&nic->pdev->dev,
  sq->dmem.q_len * TSO_HEADER_SIZE,
  sq->tso_hdrs, sq->tso_hdrs_phys);
+   sq->tso_hdrs = NULL;
+   }
 
/* Free pending skbs in the queue */
smp_rmb();
-- 
2.19.1

Fw: [Bug 201773] New: IP_FREEBIND doesn’t counteract global

2018-11-23 Thread Stephen Hemminger

Could be a bug, or just how it works?

Begin forwarded message:

Date: Thu, 22 Nov 2018 22:51:12 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 201773] New: IP_FREEBIND doesn’t counteract global


https://bugzilla.kernel.org/show_bug.cgi?id=201773

Bug ID: 201773
   Summary: IP_FREEBIND doesn’t counteract global
   Product: Networking
   Version: 2.5
Kernel Version: 3.10.0-862.11.6.el7.x86_64
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: IPV4
  Assignee: step...@networkplumber.org
  Reporter: fel...@felipegasper.com
Regression: No

The following should fail, regardless of /proc/sys/net/ipv4/ip_nonlocal_bind:

-
> strace -e socket,setsockopt,bind perl -MSocket -Mautodie -e'socket my $s,
> PF_INET, SOCK_STREAM, 0; setsockopt( $s, IPPROTO_IP, 15, 0 ); bind( $s,
> pack_sockaddr_in( 0, inet_aton("1.2.3.4") ) );'  
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
setsockopt(3, SOL_IP, IP_FREEBIND, [0], 4) = 0
bind(3, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("1.2.3.4")},
16) = 0
+++ exited with 0 +++
-

… however, it appears that setsockopt() doesn’t disable IP_FREEBIND if
ip_nonlocal_bind is set via /proc.

-- 
You are receiving this mail because:
You are the assignee for the bug.

[PATCH net-next 6/8] dpaa2-eth: Add support for XDP_TX

2018-11-23 Thread Ioana Ciocoi Radulescu

Send frames back on the same port for XDP_TX action.
Since the frame buffers have been allocated by us, we can recycle
them directly into the Rx buffer pool instead of requesting a
confirmation frame upon transmission complete.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 43 +++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h |  2 ++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 1bdcd71..3dabee0 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -240,14 +240,50 @@ static void xdp_release_buf(struct dpaa2_eth_priv *priv,
ch->xdp.drop_cnt = 0;
 }
 
+static int xdp_enqueue(struct dpaa2_eth_priv *priv, struct dpaa2_fd *fd,
+  void *buf_start, u16 queue_id)
+{
+   struct dpaa2_eth_fq *fq;
+   struct dpaa2_faead *faead;
+   u32 ctrl, frc;
+   int i, err;
+
+   /* Mark the egress frame hardware annotation area as valid */
+   frc = dpaa2_fd_get_frc(fd);
+   dpaa2_fd_set_frc(fd, frc | DPAA2_FD_FRC_FAEADV);
+   dpaa2_fd_set_ctrl(fd, DPAA2_FD_CTRL_ASAL);
+
+   /* Instruct hardware to release the FD buffer directly into
+* the buffer pool once transmission is completed, instead of
+* sending a Tx confirmation frame to us
+*/
+   ctrl = DPAA2_FAEAD_A4V | DPAA2_FAEAD_A2V | DPAA2_FAEAD_EBDDV;
+   faead = dpaa2_get_faead(buf_start, false);
+   faead->ctrl = cpu_to_le32(ctrl);
+   faead->conf_fqid = 0;
+
+   fq = &priv->fq[queue_id];
+   for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) {
+   err = dpaa2_io_service_enqueue_qd(fq->channel->dpio,
+ priv->tx_qdid, 0,
+ fq->tx_qdbin, fd);
+   if (err != -EBUSY)
+   break;
+   }
+
+   return err;
+}
+
 static u32 run_xdp(struct dpaa2_eth_priv *priv,
   struct dpaa2_eth_channel *ch,
+  struct dpaa2_eth_fq *rx_fq,
   struct dpaa2_fd *fd, void *vaddr)
 {
dma_addr_t addr = dpaa2_fd_get_addr(fd);
struct bpf_prog *xdp_prog;
struct xdp_buff xdp;
u32 xdp_act = XDP_PASS;
+   int err;
 
rcu_read_lock();
 
@@ -269,6 +305,11 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
switch (xdp_act) {
case XDP_PASS:
break;
+   case XDP_TX:
+   err = xdp_enqueue(priv, fd, vaddr, rx_fq->flowid);
+   if (err)
+   xdp_release_buf(priv, ch, addr);
+   break;
default:
bpf_warn_invalid_xdp_action(xdp_act);
case XDP_ABORTED:
@@ -317,7 +358,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
percpu_extras = this_cpu_ptr(priv->percpu_extras);
 
if (fd_format == dpaa2_fd_single) {
-   xdp_act = run_xdp(priv, ch, (struct dpaa2_fd *)fd, vaddr);
+   xdp_act = run_xdp(priv, ch, fq, (struct dpaa2_fd *)fd, vaddr);
if (xdp_act != XDP_PASS)
return;
 
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 23cf9d9..5530a0e 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -139,7 +139,9 @@ struct dpaa2_faead {
 };
 
 #define DPAA2_FAEAD_A2V    0x20000000
+#define DPAA2_FAEAD_A4V    0x08000000
 #define DPAA2_FAEAD_UPDV   0x00001000
+#define DPAA2_FAEAD_EBDDV  0x00002000
 #define DPAA2_FAEAD_UPD    0x00000010
 
 /* Accessors for the hardware annotation fields that we use */
-- 
2.7.4

[PATCH net-next 0/8] dpaa2-eth: Introduce XDP support

2018-11-23 Thread Ioana Ciocoi Radulescu

Add support for XDP programs. Only XDP_PASS, XDP_DROP and XDP_TX
actions are supported for now. Frame header changes are also
allowed.

Ioana Radulescu (8):
  dpaa2-eth: Add basic XDP support
  dpaa2-eth: Allow XDP header adjustments
  dpaa2-eth: Move function
  dpaa2-eth: Release buffers back to pool on XDP_DROP
  dpaa2-eth: Map Rx buffers as bidirectional
  dpaa2-eth: Add support for XDP_TX
  dpaa2-eth: Cleanup channel stats
  dpaa2-eth: Add xdp counters

 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c   | 336 +++--
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h   |  20 +-
 .../net/ethernet/freescale/dpaa2/dpaa2-ethtool.c   |  19 +-
 3 files changed, 337 insertions(+), 38 deletions(-)

-- 
2.7.4

[PATCH net-next 1/8] dpaa2-eth: Add basic XDP support

2018-11-23 Thread Ioana Ciocoi Radulescu

We keep one XDP program reference per channel. The only actions
supported for now are XDP_DROP and XDP_PASS.

Until now we didn't enforce a maximum size for Rx frames based
on MTU value. Change that, since for XDP mode we must ensure no
scatter-gather frames can be received.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 182 ++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h |   6 +
 2 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 640967a..5340ac9 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -13,7 +13,8 @@
 #include 
 #include 
 #include 
-
+#include 
+#include 
 #include 
 
 #include "dpaa2-eth.h"
@@ -199,6 +200,45 @@ static struct sk_buff *build_frag_skb(struct 
dpaa2_eth_priv *priv,
return skb;
 }
 
+static u32 run_xdp(struct dpaa2_eth_priv *priv,
+  struct dpaa2_eth_channel *ch,
+  struct dpaa2_fd *fd, void *vaddr)
+{
+   struct bpf_prog *xdp_prog;
+   struct xdp_buff xdp;
+   u32 xdp_act = XDP_PASS;
+
+   rcu_read_lock();
+
+   xdp_prog = READ_ONCE(ch->xdp.prog);
+   if (!xdp_prog)
+   goto out;
+
+   xdp.data = vaddr + dpaa2_fd_get_offset(fd);
+   xdp.data_end = xdp.data + dpaa2_fd_get_len(fd);
+   xdp.data_hard_start = xdp.data;
+   xdp_set_data_meta_invalid(&xdp);
+
+   xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+   switch (xdp_act) {
+   case XDP_PASS:
+   break;
+   default:
+   bpf_warn_invalid_xdp_action(xdp_act);
+   case XDP_ABORTED:
+   trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act);
+   case XDP_DROP:
+   ch->buf_count--;
+   free_rx_fd(priv, fd, vaddr);
+   break;
+   }
+
+out:
+   rcu_read_unlock();
+   return xdp_act;
+}
+
 /* Main Rx frame processing routine */
 static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
 struct dpaa2_eth_channel *ch,
@@ -215,6 +255,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
struct dpaa2_fas *fas;
void *buf_data;
u32 status = 0;
+   u32 xdp_act;
 
/* Tracing point */
trace_dpaa2_rx_fd(priv->net_dev, fd);
@@ -231,8 +272,14 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
percpu_extras = this_cpu_ptr(priv->percpu_extras);
 
if (fd_format == dpaa2_fd_single) {
+   xdp_act = run_xdp(priv, ch, (struct dpaa2_fd *)fd, vaddr);
+   if (xdp_act != XDP_PASS)
+   return;
+
skb = build_linear_skb(ch, fd, vaddr);
} else if (fd_format == dpaa2_fd_sg) {
+   WARN_ON(priv->xdp_prog);
+
skb = build_frag_skb(priv, ch, buf_data);
skb_free_frag(vaddr);
percpu_extras->rx_sg_frames++;
@@ -1427,6 +1474,137 @@ static int dpaa2_eth_ioctl(struct net_device *dev, 
struct ifreq *rq, int cmd)
return -EINVAL;
 }
 
+static bool xdp_mtu_valid(struct dpaa2_eth_priv *priv, int mtu)
+{
+   int mfl, linear_mfl;
+
+   mfl = DPAA2_ETH_L2_MAX_FRM(mtu);
+   linear_mfl = DPAA2_ETH_RX_BUF_SIZE - DPAA2_ETH_RX_HWA_SIZE -
+dpaa2_eth_rx_head_room(priv);
+
+   return (mfl <= linear_mfl);
+}
+
+static int set_rx_mfl(struct dpaa2_eth_priv *priv, int mtu, bool has_xdp)
+{
+   int mfl, err;
+
+   /* We enforce a maximum Rx frame length based on MTU only if we have
+* an XDP program attached (in order to avoid Rx S/G frames).
+* Otherwise, we accept all incoming frames as long as they are not
+* larger than maximum size supported in hardware
+*/
+   if (has_xdp)
+   mfl = DPAA2_ETH_L2_MAX_FRM(mtu);
+   else
+   mfl = DPAA2_ETH_MFL;
+
+   err = dpni_set_max_frame_length(priv->mc_io, 0, priv->mc_token, mfl);
+   if (err) {
+   netdev_err(priv->net_dev, "dpni_set_max_frame_length failed\n");
+   return err;
+   }
+
+   return 0;
+}
+
+static int dpaa2_eth_change_mtu(struct net_device *dev, int new_mtu)
+{
+   struct dpaa2_eth_priv *priv = netdev_priv(dev);
+   int err;
+
+   if (!priv->xdp_prog)
+   goto out;
+
+   if (!xdp_mtu_valid(priv, new_mtu))
+   return -EINVAL;
+
+   err = set_rx_mfl(priv, new_mtu, true);
+   if (err)
+   return err;
+
+out:
+   dev->mtu = new_mtu;
+   return 0;
+}
+
+static int setup_xdp(struct net_device *dev, struct bpf_prog *prog)
+{
+   struct dpaa2_eth_priv *priv = netdev_priv(dev);
+   struct dpaa2_eth_channel *ch;
+   struct bpf_prog *old;
+   bool up, update_settings;
+   int i, err;
+
+   if (prog && !xdp_mtu_valid(priv, dev->mtu)) {
+

[PATCH net-next 7/8] dpaa2-eth: Cleanup channel stats

2018-11-23 Thread Ioana Ciocoi Radulescu

Remove one unused counter. Move the Tx portal busy counter to
the channel stats, since it logically belongs here.

Reorder fields in channel stats structure to match the ethtool
strings order and make it easier to print them with ethtool -S.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c |  3 +--
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 10 --
 drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 18 ++
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 3dabee0..995a17d 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -456,7 +456,6 @@ static int consume_frames(struct dpaa2_eth_channel *ch,
return 0;
 
fq->stats.frames += cleaned;
-   ch->stats.frames += cleaned;
 
/* A dequeue operation only pulls frames from a single queue
 * into the store. Return the frame queue as an out param.
@@ -776,7 +775,7 @@ static netdev_tx_t dpaa2_eth_tx(struct sk_buff *skb, struct 
net_device *net_dev)
if (err != -EBUSY)
break;
}
-   percpu_extras->tx_portal_busy += i;
+   fq->channel->stats.tx_portal_busy += i;
if (unlikely(err < 0)) {
percpu_stats->tx_errors++;
/* Clean up everything, including freeing the skb */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 5530a0e..320c0d0 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -231,8 +231,6 @@ struct dpaa2_eth_drv_stats {
__u64   tx_reallocs;
__u64   rx_sg_frames;
__u64   rx_sg_bytes;
-   /* Enqueues retried due to portal busy */
-   __u64   tx_portal_busy;
 };
 
 /* Per-FQ statistics */
@@ -243,14 +241,14 @@ struct dpaa2_eth_fq_stats {
 
 /* Per-channel statistics */
 struct dpaa2_eth_ch_stats {
+   /* Enqueues retried due to portal busy */
+   __u64   tx_portal_busy;
/* Volatile dequeues retried due to portal busy */
__u64 dequeue_portal_busy;
-   /* Number of CDANs; useful to estimate avg NAPI len */
-   __u64 cdan;
-   /* Number of frames received on queues from this channel */
-   __u64 frames;
/* Pull errors */
__u64 pull_err;
+   /* Number of CDANs; useful to estimate avg NAPI len */
+   __u64 cdan;
 };
 
 /* Maximum number of queues associated with a DPNI */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
index 26bd5a2..3aa7885 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
@@ -40,8 +40,8 @@ static char dpaa2_ethtool_extras[][ETH_GSTRING_LEN] = {
"[drv] tx realloc frames",
"[drv] rx sg frames",
"[drv] rx sg bytes",
-   "[drv] enqueue portal busy",
/* Channel stats */
+   "[drv] enqueue portal busy",
"[drv] dequeue portal busy",
"[drv] channel pull errors",
"[drv] cdan",
@@ -174,8 +174,6 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device 
*net_dev,
int j, k, err;
int num_cnt;
union dpni_statistics dpni_stats;
-   u64 cdan = 0;
-   u64 portal_busy = 0, pull_err = 0;
struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
struct dpaa2_eth_drv_stats *extras;
struct dpaa2_eth_ch_stats *ch_stats;
@@ -212,16 +210,12 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device 
*net_dev,
}
i += j;
 
-   for (j = 0; j < priv->num_channels; j++) {
-   ch_stats = &priv->channel[j]->stats;
-   cdan += ch_stats->cdan;
-   portal_busy += ch_stats->dequeue_portal_busy;
-   pull_err += ch_stats->pull_err;
+   /* Per-channel stats */
+   for (k = 0; k < priv->num_channels; k++) {
+   ch_stats = &priv->channel[k]->stats;
+   for (j = 0; j < sizeof(*ch_stats) / sizeof(__u64); j++)
+   *((__u64 *)data + i + j) += *((__u64 *)ch_stats + j);
}
-
-   *(data + i++) = portal_busy;
-   *(data + i++) = pull_err;
-   *(data + i++) = cdan;
 }
 
 static int prep_eth_rule(struct ethhdr *eth_value, struct ethhdr *eth_mask,
-- 
2.7.4

[PATCH net-next 5/8] dpaa2-eth: Map Rx buffers as bidirectional

2018-11-23 Thread Ioana Ciocoi Radulescu

In order to support enqueueing Rx FDs back to hardware, we need to
DMA map Rx buffers as bidirectional.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 6256154..1bdcd71 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -87,7 +87,7 @@ static void free_rx_fd(struct dpaa2_eth_priv *priv,
addr = dpaa2_sg_get_addr([i]);
sg_vaddr = dpaa2_iova_to_virt(priv->iommu_domain, addr);
dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
+DMA_BIDIRECTIONAL);
 
skb_free_frag(sg_vaddr);
if (dpaa2_sg_is_final([i]))
@@ -145,7 +145,7 @@ static struct sk_buff *build_frag_skb(struct dpaa2_eth_priv 
*priv,
sg_addr = dpaa2_sg_get_addr(sge);
sg_vaddr = dpaa2_iova_to_virt(priv->iommu_domain, sg_addr);
dma_unmap_single(dev, sg_addr, DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
+DMA_BIDIRECTIONAL);
 
sg_length = dpaa2_sg_get_len(sge);
 
@@ -212,7 +212,7 @@ static void free_bufs(struct dpaa2_eth_priv *priv, u64 
*buf_array, int count)
for (i = 0; i < count; i++) {
vaddr = dpaa2_iova_to_virt(priv->iommu_domain, buf_array[i]);
dma_unmap_single(dev, buf_array[i], DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
+DMA_BIDIRECTIONAL);
skb_free_frag(vaddr);
}
 }
@@ -306,7 +306,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
 
vaddr = dpaa2_iova_to_virt(priv->iommu_domain, addr);
dma_sync_single_for_cpu(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
-   DMA_FROM_DEVICE);
+   DMA_BIDIRECTIONAL);
 
fas = dpaa2_get_fas(vaddr, false);
prefetch(fas);
@@ -322,13 +322,13 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
return;
 
dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
+DMA_BIDIRECTIONAL);
skb = build_linear_skb(ch, fd, vaddr);
} else if (fd_format == dpaa2_fd_sg) {
WARN_ON(priv->xdp_prog);
 
dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
+DMA_BIDIRECTIONAL);
skb = build_frag_skb(priv, ch, buf_data);
skb_free_frag(vaddr);
percpu_extras->rx_sg_frames++;
@@ -862,7 +862,7 @@ static int add_bufs(struct dpaa2_eth_priv *priv,
buf = PTR_ALIGN(buf, priv->rx_buf_align);
 
addr = dma_map_single(dev, buf, DPAA2_ETH_RX_BUF_SIZE,
- DMA_FROM_DEVICE);
+ DMA_BIDIRECTIONAL);
if (unlikely(dma_mapping_error(dev, addr)))
goto err_map;
 
-- 
2.7.4

[PATCH net-next 4/8] dpaa2-eth: Release buffers back to pool on XDP_DROP

2018-11-23 Thread Ioana Ciocoi Radulescu

Instead of freeing the RX buffers, release them back into the pool.
We wait for the maximum number of buffers supported by a single
release command to accumulate before issuing the command.

Also, don't unmap the Rx buffers at the beginning of the Rx routine
anymore, since that would require remapping them before release.
Instead, just do a DMA sync at first and only unmap if the frame is
meant for the stack.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 34 +---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h |  2 ++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 9ed4d8e..6256154 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -217,10 +217,34 @@ static void free_bufs(struct dpaa2_eth_priv *priv, u64 
*buf_array, int count)
}
 }
 
+static void xdp_release_buf(struct dpaa2_eth_priv *priv,
+   struct dpaa2_eth_channel *ch,
+   dma_addr_t addr)
+{
+   int err;
+
+   ch->xdp.drop_bufs[ch->xdp.drop_cnt++] = addr;
+   if (ch->xdp.drop_cnt < DPAA2_ETH_BUFS_PER_CMD)
+   return;
+
+   while ((err = dpaa2_io_service_release(ch->dpio, priv->bpid,
+  ch->xdp.drop_bufs,
+  ch->xdp.drop_cnt)) == -EBUSY)
+   cpu_relax();
+
+   if (err) {
+   free_bufs(priv, ch->xdp.drop_bufs, ch->xdp.drop_cnt);
+   ch->buf_count -= ch->xdp.drop_cnt;
+   }
+
+   ch->xdp.drop_cnt = 0;
+}
+
 static u32 run_xdp(struct dpaa2_eth_priv *priv,
   struct dpaa2_eth_channel *ch,
   struct dpaa2_fd *fd, void *vaddr)
 {
+   dma_addr_t addr = dpaa2_fd_get_addr(fd);
struct bpf_prog *xdp_prog;
struct xdp_buff xdp;
u32 xdp_act = XDP_PASS;
@@ -250,8 +274,7 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
case XDP_ABORTED:
trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act);
case XDP_DROP:
-   ch->buf_count--;
-   free_rx_fd(priv, fd, vaddr);
+   xdp_release_buf(priv, ch, addr);
break;
}
 
@@ -282,7 +305,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
trace_dpaa2_rx_fd(priv->net_dev, fd);
 
vaddr = dpaa2_iova_to_virt(priv->iommu_domain, addr);
-   dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE, DMA_FROM_DEVICE);
+   dma_sync_single_for_cpu(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
+   DMA_FROM_DEVICE);
 
fas = dpaa2_get_fas(vaddr, false);
prefetch(fas);
@@ -297,10 +321,14 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
if (xdp_act != XDP_PASS)
return;
 
+   dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
+DMA_FROM_DEVICE);
skb = build_linear_skb(ch, fd, vaddr);
} else if (fd_format == dpaa2_fd_sg) {
WARN_ON(priv->xdp_prog);
 
+   dma_unmap_single(dev, addr, DPAA2_ETH_RX_BUF_SIZE,
+DMA_FROM_DEVICE);
skb = build_frag_skb(priv, ch, buf_data);
skb_free_frag(vaddr);
percpu_extras->rx_sg_frames++;
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 2873a15..23cf9d9 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -285,6 +285,8 @@ struct dpaa2_eth_fq {
 
 struct dpaa2_eth_ch_xdp {
struct bpf_prog *prog;
+   u64 drop_bufs[DPAA2_ETH_BUFS_PER_CMD];
+   int drop_cnt;
 };
 
 struct dpaa2_eth_channel {
-- 
2.7.4

[PATCH net-next 8/8] dpaa2-eth: Add xdp counters

2018-11-23 Thread Ioana Ciocoi Radulescu

Add counters for xdp processed frames to the channel statistics.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 ++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 4 
 drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 3 +++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 995a17d..4305e76 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -307,8 +307,12 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
break;
case XDP_TX:
err = xdp_enqueue(priv, fd, vaddr, rx_fq->flowid);
-   if (err)
+   if (err) {
xdp_release_buf(priv, ch, addr);
+   ch->stats.xdp_tx_err++;
+   } else {
+   ch->stats.xdp_tx++;
+   }
break;
default:
bpf_warn_invalid_xdp_action(xdp_act);
@@ -316,6 +320,7 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act);
case XDP_DROP:
xdp_release_buf(priv, ch, addr);
+   ch->stats.xdp_drop++;
break;
}
 
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 320c0d0..1690fc1 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -249,6 +249,10 @@ struct dpaa2_eth_ch_stats {
__u64 pull_err;
/* Number of CDANs; useful to estimate avg NAPI len */
__u64 cdan;
+   /* XDP counters */
+   __u64 xdp_drop;
+   __u64 xdp_tx;
+   __u64 xdp_tx_err;
 };
 
 /* Maximum number of queues associated with a DPNI */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
index 3aa7885..dddc437 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
@@ -45,6 +45,9 @@ static char dpaa2_ethtool_extras[][ETH_GSTRING_LEN] = {
"[drv] dequeue portal busy",
"[drv] channel pull errors",
"[drv] cdan",
+   "[drv] xdp drop",
+   "[drv] xdp tx",
+   "[drv] xdp tx errors",
 };
 
 #define DPAA2_ETH_NUM_EXTRA_STATS  ARRAY_SIZE(dpaa2_ethtool_extras)
-- 
2.7.4

[PATCH net-next 3/8] dpaa2-eth: Move function

2018-11-23 Thread Ioana Ciocoi Radulescu

We'll use function free_bufs() on the XDP path as well, so move
it higher in order to avoid a forward declaration.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 34 
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 5be3008..9ed4d8e 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -200,6 +200,23 @@ static struct sk_buff *build_frag_skb(struct 
dpaa2_eth_priv *priv,
return skb;
 }
 
+/* Free buffers acquired from the buffer pool or which were meant to
+ * be released in the pool
+ */
+static void free_bufs(struct dpaa2_eth_priv *priv, u64 *buf_array, int count)
+{
+   struct device *dev = priv->net_dev->dev.parent;
+   void *vaddr;
+   int i;
+
+   for (i = 0; i < count; i++) {
+   vaddr = dpaa2_iova_to_virt(priv->iommu_domain, buf_array[i]);
+   dma_unmap_single(dev, buf_array[i], DPAA2_ETH_RX_BUF_SIZE,
+DMA_FROM_DEVICE);
+   skb_free_frag(vaddr);
+   }
+}
+
 static u32 run_xdp(struct dpaa2_eth_priv *priv,
   struct dpaa2_eth_channel *ch,
   struct dpaa2_fd *fd, void *vaddr)
@@ -794,23 +811,6 @@ static int set_tx_csum(struct dpaa2_eth_priv *priv, bool 
enable)
return 0;
 }
 
-/* Free buffers acquired from the buffer pool or which were meant to
- * be released in the pool
- */
-static void free_bufs(struct dpaa2_eth_priv *priv, u64 *buf_array, int count)
-{
-   struct device *dev = priv->net_dev->dev.parent;
-   void *vaddr;
-   int i;
-
-   for (i = 0; i < count; i++) {
-   vaddr = dpaa2_iova_to_virt(priv->iommu_domain, buf_array[i]);
-   dma_unmap_single(dev, buf_array[i], DPAA2_ETH_RX_BUF_SIZE,
-DMA_FROM_DEVICE);
-   skb_free_frag(vaddr);
-   }
-}
-
 /* Perform a single release command to add buffers
  * to the specified buffer pool
  */
-- 
2.7.4

[PATCH net-next 2/8] dpaa2-eth: Allow XDP header adjustments

2018-11-23 Thread Ioana Ciocoi Radulescu

Reserve XDP_PACKET_HEADROOM bytes in Rx buffers to allow XDP
programs to increase frame header size.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 43 ++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 5340ac9..5be3008 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -216,11 +216,15 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
 
xdp.data = vaddr + dpaa2_fd_get_offset(fd);
xdp.data_end = xdp.data + dpaa2_fd_get_len(fd);
-   xdp.data_hard_start = xdp.data;
+   xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
xdp_set_data_meta_invalid();
 
xdp_act = bpf_prog_run_xdp(xdp_prog, );
 
+   /* xdp.data pointer may have changed */
+   dpaa2_fd_set_offset(fd, xdp.data - vaddr);
+   dpaa2_fd_set_len(fd, xdp.data_end - xdp.data);
+
switch (xdp_act) {
case XDP_PASS:
break;
@@ -1480,7 +1484,7 @@ static bool xdp_mtu_valid(struct dpaa2_eth_priv *priv, 
int mtu)
 
mfl = DPAA2_ETH_L2_MAX_FRM(mtu);
linear_mfl = DPAA2_ETH_RX_BUF_SIZE - DPAA2_ETH_RX_HWA_SIZE -
-dpaa2_eth_rx_head_room(priv);
+dpaa2_eth_rx_head_room(priv) - XDP_PACKET_HEADROOM;
 
return (mfl <= linear_mfl);
 }
@@ -1528,6 +1532,32 @@ static int dpaa2_eth_change_mtu(struct net_device *dev, 
int new_mtu)
return 0;
 }
 
+static int update_rx_buffer_headroom(struct dpaa2_eth_priv *priv, bool has_xdp)
+{
+   struct dpni_buffer_layout buf_layout = {0};
+   int err;
+
+   err = dpni_get_buffer_layout(priv->mc_io, 0, priv->mc_token,
+DPNI_QUEUE_RX, &buf_layout);
+   if (err) {
+   netdev_err(priv->net_dev, "dpni_get_buffer_layout failed\n");
+   return err;
+   }
+
+   /* Reserve extra headroom for XDP header size changes */
+   buf_layout.data_head_room = dpaa2_eth_rx_head_room(priv) +
+   (has_xdp ? XDP_PACKET_HEADROOM : 0);
+   buf_layout.options = DPNI_BUF_LAYOUT_OPT_DATA_HEAD_ROOM;
+   err = dpni_set_buffer_layout(priv->mc_io, 0, priv->mc_token,
+DPNI_QUEUE_RX, &buf_layout);
+   if (err) {
+   netdev_err(priv->net_dev, "dpni_set_buffer_layout failed\n");
+   return err;
+   }
+
+   return 0;
+}
+
 static int setup_xdp(struct net_device *dev, struct bpf_prog *prog)
 {
struct dpaa2_eth_priv *priv = netdev_priv(dev);
@@ -1553,11 +1583,18 @@ static int setup_xdp(struct net_device *dev, struct 
bpf_prog *prog)
if (up)
dpaa2_eth_stop(dev);
 
-   /* While in xdp mode, enforce a maximum Rx frame size based on MTU */
+   /* While in xdp mode, enforce a maximum Rx frame size based on MTU.
+* Also, when switching between xdp/non-xdp modes we need to reconfigure
+* our Rx buffer layout. Buffer pool was drained on dpaa2_eth_stop,
+* so we are sure no old format buffers will be used from now on.
+*/
if (update_settings) {
err = set_rx_mfl(priv, dev->mtu, !!prog);
if (err)
goto out_err;
+   err = update_rx_buffer_headroom(priv, !!prog);
+   if (err)
+   goto out_err;
}
 
old = xchg(>xdp_prog, prog);
-- 
2.7.4

[PATCH v2] bpf: fix check of allowed specifiers in bpf_trace_printk

2018-11-23 Thread Martynas Pumputis

A format string consisting of "%p" or "%s" followed by an invalid
specifier (e.g. "%p%\n" or "%s%") could pass the check, which
would cause format_decode (lib/vsprintf.c) to warn.

Reported-by: syzbot+1ec5c5ec949c4adaa...@syzkaller.appspotmail.com
Signed-off-by: Martynas Pumputis 
---
 kernel/trace/bpf_trace.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08fcfe440c63..9864a35c8bb5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, 
u64, arg1,
i++;
} else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
-   i++;
-   if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+   /* disallow any further format extensions */
+   if (fmt[i + 1] != 0 &&
+   !isspace(fmt[i + 1]) &&
+   !ispunct(fmt[i + 1]))
return -EINVAL;
fmt_cnt++;
-   if (fmt[i - 1] == 's') {
+   if (fmt[i] == 's') {
if (str_seen)
/* allow only one '%s' per fmt string */
return -EINVAL;
-- 
2.19.1

ixgbe driver hits null pointer in net/core/dev.c

2018-11-23 Thread Nathanael Davison

Hi all,

When running kernel 4.19.x on Xen on machines that use the Intel ixgbe driver,
the driver crashes on startup with the kernel trace below.

It appears that the loop on line 2432 of net/core/dev.c iterates beyond the end 
of the dev_maps->attr_map array, resulting in dereferencing garbage. A 
workaround is to add the condition tci 8b 11 45 8d 4a ff 44 89 c8 83 f8 ff 4c 63 c0 74 2d 
49 83 c0 08
Nov 23 12:49:47 localhost kernel: [   63.471870] RSP: e02b:c90040ccf9e0 
EFLAGS: 00010202
Nov 23 12:49:47 localhost kernel: [   63.471878] RAX:  RBX: 
88000456d280 RCX: 00140001
Nov 23 12:49:47 localhost kernel: [   63.471921] RDX:  RSI: 
0006 RDI: 88000456d280
Nov 23 12:49:47 localhost kernel: [   63.471930] RBP: 0001 R08: 
 R09: 
Nov 23 12:49:47 localhost kernel: [   63.471940] R10: 0001 R11: 
cd40 R12: 
Nov 23 12:49:47 localhost kernel: [   63.471949] R13: 0006 R14: 
0001 R15: 0002
Nov 23 12:49:47 localhost kernel: [   63.471966] FS:  7f77bf9a3740() 
GS:88002d84() knlGS:
Nov 23 12:49:47 localhost kernel: [   63.471976] CS:  e033 DS:  ES:  
CR0: 80050033
Nov 23 12:49:47 localhost kernel: [   63.471985] CR2: 00140001 CR3: 
2738c000 CR4: 2660
Nov 23 12:49:47 localhost kernel: [   63.471999] Call Trace:
Nov 23 12:49:47 localhost kernel: [   63.472008]  
__netif_set_xps_queue+0x735/0x910
Nov 23 12:49:47 localhost kernel: [   63.472018]  netif_set_xps_queue+0x28/0x40
Nov 23 12:49:47 localhost kernel: [   63.472028]  
ixgbe_configure_tx_ring+0x16d/0x270 [ixgbe]
Nov 23 12:49:47 localhost kernel: [   63.472039]  
ixgbe_configure_rx_ring+0x693/0x11f0 [ixgbe]
Nov 23 12:49:47 localhost kernel: [   63.472049]  ixgbe_open+0x21b/0x7b0 [ixgbe]
Nov 23 12:49:47 localhost kernel: [   63.472059]  
ixgbe_dcb_hw_config_82599+0xab2/0x1360 [ixgbe]
Nov 23 12:49:47 localhost kernel: [   63.472069]  dcbnl_setstate+0x3f/0x90
Nov 23 12:49:47 localhost kernel: [   63.472076]  dcb_doit+0x124/0x1d0
Nov 23 12:49:47 localhost kernel: [   63.472085]  rtnetlink_rcv_msg+0x2a2/0x320
Nov 23 12:49:47 localhost kernel: [   63.472093]  ? 
_raw_spin_unlock_irqrestore+0x14/0x20
Nov 23 12:49:47 localhost kernel: [   63.472101]  ? 
__skb_try_recv_datagram+0xd3/0x180
Nov 23 12:49:47 localhost kernel: [   63.472109]  ? 
rtnl_calcit.isra.31+0x110/0x110
Nov 23 12:49:47 localhost kernel: [   63.472118]  netlink_rcv_skb+0xd4/0x110
Nov 23 12:49:47 localhost kernel: [   63.472126]  netlink_unicast+0x182/0x230
Nov 23 12:49:47 localhost kernel: [   63.472133]  netlink_sendmsg+0x2ed/0x3e0
Nov 23 12:49:47 localhost kernel: [   63.472142]  sock_sendmsg+0x36/0x50
Nov 23 12:49:47 localhost kernel: [   63.472149]  __sys_sendto+0xdc/0x160
Nov 23 12:49:47 localhost kernel: [   63.472158]  ? 
__call_rcu.constprop.74+0xc8/0x1d0
Nov 23 12:49:47 localhost kernel: [   63.472167]  __x64_sys_sendto+0x24/0x30
Nov 23 12:49:47 localhost kernel: [   63.472175]  do_syscall_64+0x4e/0x100
Nov 23 12:49:47 localhost kernel: [   63.472183]  
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Nov 23 12:49:47 localhost kernel: [   63.472191] RIP: 0033:0x7f77bee96353
Nov 23 12:49:47 localhost kernel: [   63.472198] Code: 48 8b 0d 38 2b 2c 00 f7 
d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d ad 8c 2c 00 00 75 13 49 89 
ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 1b c9 00 00 
48 89 04 24
Nov 23 12:49:47 localhost kernel: [   63.472219] RSP: 002b:7fff3d4a1568 
EFLAGS: 0246 ORIG_RAX: 002c
Nov 23 12:49:47 localhost kernel: [   63.472230] RAX: ffda RBX: 
562e22ccc970 RCX: 7f77bee96353
Nov 23 12:49:47 localhost kernel: [   63.472239] RDX: 0028 RSI: 
562e22ccc970 RDI: 0005
Nov 23 12:49:47 localhost kernel: [   63.472249] RBP: 562e22ccc970 R08: 
7fff3d4a1570 R09: 000c
Nov 23 12:49:47 localhost kernel: [   63.472259] R10:  R11: 
0246 R12: 7fff3d4a1570
Nov 23 12:49:47 localhost kernel: [   63.472269] R13: 0028 R14: 
 R15: 0009
Nov 23 12:49:47 localhost kernel: [   63.472279] Modules linked in: 8021q garp 
mrp stp llc openvswitch nsh nf_nat_ipv6 nf_nat_ipv4 nf_conncount nf_nat 
ipt_REJECT nf_reject_ipv4 xt_tcpudp xt_multiport xt_conntrack nf_conntrack 
nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c iptable_filter dm_multipath dm_mod 
dcdbas intel_powerclamp coretemp sg ipmi_si bnx2 ipmi_devintf ixgbe 
ipmi_msghandler i7core_edac acpi_power_meter lpc_ich hed mdio nls_utf8 isofs 
loop nfsd auth_rpcgss oid_registry nfs_acl lockd grace sunrpc ip_tables 
x_tables sha1_ssse3 sha1_generic ipv6 sd_mod sr_mod cdrom hid_generic 
ata_generic pata_acpi usbhid hid ata_piix libata mptsas scsi_transport_sas 
mptscsih mptbase ehci_pci ehci_hcd scsi_dh_rdac scsi_dh_hp_sw scsi_dh_emc 
scsi_dh_alua

Re: [PATCHv2 1/2] can: xilinx: add can 2.0 support

2018-11-23 Thread Marc Kleine-Budde

On 10/12/18 6:25 AM, shubhrajyoti.da...@gmail.com wrote:
> From: Shubhrajyoti Datta 
> 
> Add support for can 2.0.
> 
> Signed-off-by: Shubhrajyoti Datta 

Added to linux-can-next.

Tnx,
Marc

-- 
Pengutronix e.K.  | Marc Kleine-Budde   |
Industrial Linux Solutions| Phone: +49-231-2826-924 |
Vertretung West/Dortmund  | Fax:   +49-5121-206917- |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |



signature.asc
Description: OpenPGP digital signature

Re: [PATCH bpf-next] bpf: Add BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_QUEUE to bpftool-map

2018-11-23 Thread Edward Cree

On 22/11/18 20:59, David Calavera wrote:
> I noticed that these two new BPF Maps are not defined in bpftool.
> This patch defines those two maps and adds their names to the
> bpftool-map documentation.
>
> Signed-off-by: David Calavera 
> ---
Subject line says 'QUEUE' twice, should one of those be 'STACK'?
>  tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++-
>  tools/bpf/bpftool/map.c | 2 ++
>  2 files changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> index f55a2daed59b..9e827e342d9e 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> @@ -42,7 +42,8 @@ MAP COMMANDS
>  || **percpu_array** | **stack_trace** | **cgroup_array** | 
> **lru_hash**
>  || **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | 
> **hash_of_maps**
>  || **devmap** | **sockmap** | **cpumap** | **xskmap** | 
> **sockhash**
> -|| **cgroup_storage** | **reuseport_sockarray** | 
> **percpu_cgroup_storage** }
> +|| **cgroup_storage** | **reuseport_sockarray** | 
> **percpu_cgroup_storage**
> +|| **queue** | **stack** }
>  
>  DESCRIPTION
>  ===
> diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
> index 7bf38f0e152e..68b656b6edcc 100644
> --- a/tools/bpf/bpftool/map.c
> +++ b/tools/bpf/bpftool/map.c
> @@ -74,6 +74,8 @@ static const char * const map_type_name[] = {
>   [BPF_MAP_TYPE_CGROUP_STORAGE]   = "cgroup_storage",
>   [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
>   [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE]= "percpu_cgroup_storage",
> + [BPF_MAP_TYPE_QUEUE] = "queue",
> + [BPF_MAP_TYPE_STACK] = "stack",
>  };
>  
>  static bool map_is_per_cpu(__u32 type)

Re: [PATCH bpf-next] bpf: libbpf: retry program creation without the name

2018-11-23 Thread Quentin Monnet


2018-11-21 09:28 UTC-0800 ~ Stanislav Fomichev 

On 11/21, Quentin Monnet wrote:

2018-11-20 15:26 UTC-0800 ~ Stanislav Fomichev 

On 11/20, Alexei Starovoitov wrote:

On Wed, Nov 21, 2018 at 12:18:57AM +0100, Daniel Borkmann wrote:

On 11/21/2018 12:04 AM, Alexei Starovoitov wrote:

On Tue, Nov 20, 2018 at 01:19:05PM -0800, Stanislav Fomichev wrote:

On 11/20, Alexei Starovoitov wrote:

On Mon, Nov 19, 2018 at 04:46:25PM -0800, Stanislav Fomichev wrote:

[Recent commit 23499442c319 ("bpf: libbpf: retry map creation without
the name") fixed this issue for maps, let's do the same for programs.]

Since commit 88cda1c9da02 ("bpf: libbpf: Provide basic API support
to specify BPF obj name"), libbpf unconditionally sets bpf_attr->name
for programs. Pre v4.14 kernels don't know about programs names and
return an error about unexpected non-zero data. Retry sys_bpf without
a program name to cover older kernels.

Signed-off-by: Stanislav Fomichev 
---
  tools/lib/bpf/bpf.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 961e1b9fc592..cbe9d757c646 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -212,6 +212,16 @@ int bpf_load_program_xattr(const struct 
bpf_load_program_attr *load_attr,
if (fd >= 0 || !log_buf || !log_buf_sz)
return fd;
  
+	if (fd < 0 && errno == E2BIG && load_attr->name) {

+   /* Retry the same syscall, but without the name.
+* Pre v4.14 kernels don't support prog names.
+*/


I'm afraid that will put unnecessary stress on the kernel.
This check needs to be tighter.
Like E2BIG and anything in the log_buf probably means that
E2BIG came from the verifier and nothing to do with prog_name.
Asking the kernel to repeat is unnecessary work.

In general we need to think beyond this single prog_name field.
There are a bunch of other fields in bpf_load_program_xattr() and older kernels
won't support them. Are we going to zero them out one by one
and retry? I don't think that would be practical.

In general, we don't want to zero anything out. However,
for this particular problem the rationale is the following:
In commit 88cda1c9da02 we started unconditionally setting {prog,map}->name
from the 'higher' libbpfc layer which breaks users on the older kernels.


Also libbpf silently ignoring prog_name is not great for debugging.
A warning is needed.
But it cannot be done out of lib/bpf/bpf.c, since it's a set of syscall
wrappers.
Imo such "old kernel -> lets retry" feature should probably be done
at lib/bpf/libbpf.c level. inside load_program().

For maps, bpftool calls bpf_create_map_xattr directly, that's why
for maps I did the retry on the lower level (and why for programs I initially
thought about doing the same). However, in this case maybe asking
user to omit 'name' argument might be a better option.

For program names, I agree, we might think about doing it on the higher
level (although I'm not sure whether we want to have different API
expectations, i.e. bpf_create_map_xattr ignoring the name and
bpf_load_program_xattr not ignoring the name).

So given that rationale above, what do you think is the best way to
move forward?
1. Same patch, but tighten the retry check inside bpf_load_program_xattr ?
2. Move this retry logic into load_program and have different handling
for bpf_create_map_xattr vs bpf_load_program_xattr ?
3. Do 2 and move the retry check for maps from bpf_create_map_xattr
into bpf_object__create_maps ?

(I'm slightly leaning towards #3)


me too. I think it's cleaner for maps to do it in
bpf_object__create_maps().
Originally bpf.c was envisioned to be a thin layer on top of bpf syscall.
Whereas 'smart bits' would go into libbpf.c


Can't we create in bpf_object__load() a small helper bpf_object__probe_caps()
which would figure this out _once_ upon start with a few things to probe for
availability in the underlying kernel for maps and programs? E.g. for programs
it could try to inject a tiny 'r0 = 0; exit' snippet where we figure out
things like prog name support etc. Given underlying kernel doesn't change, we
would only try this once and it doesn't require fallback every time.


+1. great idea!

Sounds good, let me try to do it.

It sounds more like a recent LPC proposal/idea to have some sys_bpf option
to query BPF features. This new bpf_object__probe_caps can probably query
that in the future if we eventually add support for it.



Hi,

LPC proposal indeed. I've been working on implementing this kind of
probes in bpftool. I don't probe name support for now (but I can
certainly add it), but I detect supported program types, map types,
header functions, and a couple of other parameters. The idea (initially
from Daniel) was to dump "#define" declarations that could later be
included in a header file and used for a BPF project (or alternatively,
JSON output).

Oh, nice, I didn't know someone was already working on it!


I felt like bpftool was

Re: [RFC PATCH bpf-next] libbpf: make bpf_object__open default to UNSPEC

2018-11-23 Thread Wangnan (F)




On 2018/11/23 5:52, Daniel Borkmann wrote:
> [ +Wang ]
> 
> On 11/22/2018 07:03 AM, Nikita V. Shirokov wrote:
>> currently by default libbpf's bpf_object__open requires
>> bpf's program to specify a version in the code because of two things:
>> 1) default prog type is set to KPROBE
>> 2) KPROBE requires (in kernel/bpf/syscall.c) version to be specified
>>
>> in this RFC i'm proposing change default to UNSPEC and also changing
>> logic of libbpf that it would reflect what we have today in kernel
>> (i.e. only the KPROBE type requires the version to be explicitly set).
>>
>> reason for change:
>> currently only libbpf requires, by default, the version to be
>> explicitly set. it would be really hard for maintainers of other custom
>> bpf loaders to migrate to libbpf (as they don't control the user's code
>> and migration to the new loader (libbpf) won't be transparent for the end
>> user).
>>
>> what is going to be broken after this change:
>> if someone was relying on the default being KPROBE for bpf_object__open,
>> their code will stop working. however i'm really doubtful that anyone
>> is using this for kprobe type of programs (instead of, say, bcc or
>> other tracing frameworks)
>>
>> other possible solutions (for discussion, would require more machinery):
>> add another function like bpf_object__open w/ default to unspec
>>
>> Signed-off-by: Nikita V. Shirokov 
>> ---
>>  tools/lib/bpf/libbpf.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 0f14f7c074c2..ed4212a4c5f9 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -333,7 +333,7 @@ bpf_program__init(void *data, size_t size, char 
>> *section_name, int idx,
>>  prog->idx = idx;
>>  prog->instances.fds = NULL;
>>  prog->instances.nr = -1;
>> -prog->type = BPF_PROG_TYPE_KPROBE;
>> +prog->type = BPF_PROG_TYPE_UNSPEC;
>>  prog->btf_fd = -1;
> 
> Seems this was mostly for historic reasons, but for a generic library this
> would indeed be an odd convention for default. Wang, given 5f44e4c810bf
> ("tools lib bpf: New API to adjust type of a BPF program"), are you in any
> way relying on this default or using things like bpf_program__set_kprobe()
> instead which you've added there? If the latter, I'd say we should change
> it now rather than later when there's even more lib usage (and in particular
> before we add official ABI versioning).

OK. I don't rely on that now.

Thank you.

64 matches

Mail list logo