Re: consistency for statistics with XDP mode

2018-11-26 Thread Toshiaki Makita
On 2018/11/26 10:37, Toshiaki Makita wrote:
> On 2018/11/23 1:43, David Ahern wrote:
>> On 11/21/18 5:53 PM, Toshiaki Makita wrote:
 We really need consistency in the counters and at a minimum, users
 should be able to track packet and byte counters for both Rx and Tx
 including XDP.

 It seems to me the Rx and Tx packet, byte and dropped counters returned
 for the standard device stats (/proc/net/dev, ip -s li show, ...) should
 include all packets managed by the driver regardless of whether they are
 forwarded / dropped in XDP or go up the Linux stack. This also aligns
>>>
>>> Agreed. When I introduced virtio_net XDP counters, I just forgot to
>>> update tx packets/bytes counters on ndo_xdp_xmit. Probably I thought it
>>> is handled by free_old_xmit_skbs.
>>
>> Do you have some time to look at adding the Tx counters to virtio_net?
> 
> hoping I can make some time within a couple of days.

Hmm... It looks like free_old_xmit_skbs() calls dev_consume_skb_any()
for xdp_frame when napi_tx is enabled. I will fix this beforehand.

-- 
Toshiaki Makita



[PATCH net-next 2/2] bnx2x: Add storm FW version to ethtool driver query output.

2018-11-26 Thread Sudarsana Reddy Kalluru
The patch populates the Storm FW version in the ethtool driver query data.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 68aae3e..749d0ef 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1112,6 +1112,12 @@ static void bnx2x_get_drvinfo(struct net_device *dev,
strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver));
strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version));
 
+   memset(version, 0, sizeof(version));
+   snprintf(version, ETHTOOL_FWVERS_LEN, " storm %d.%d.%d.%d",
+BCM_5710_FW_MAJOR_VERSION, BCM_5710_FW_MINOR_VERSION,
+BCM_5710_FW_REVISION_VERSION, BCM_5710_FW_ENGINEERING_VERSION);
+   strlcat(info->version, version, sizeof(info->version));
+
if (SHMEM2_HAS(bp, extended_dev_info_shared_addr)) {
ext_dev_info_offset = SHMEM2_RD(bp,
extended_dev_info_shared_addr);
-- 
1.8.3.1



[PATCH net-next 0/2] bnx2x: Populate firmware versions in driver info query.

2018-11-26 Thread Sudarsana Reddy Kalluru
From: Sudarsana Reddy Kalluru 

The patch series populates MBI and storm firmware versions in the ethtool
driver info query.

Please consider applying it to 'net-next' tree.

Sudarsana Reddy Kalluru (2):
  bnx2x: Add MBI version to ethtool driver query output.
  bnx2x: Add storm FW version to ethtool driver query output.

 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c| 30 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h|  5 
 2 files changed, 34 insertions(+), 1 deletion(-)

-- 
1.8.3.1



[PATCH net-next 1/2] bnx2x: Add MBI version to ethtool driver query output.

2018-11-26 Thread Sudarsana Reddy Kalluru
The patch populates the MBI version in the ethtool driver query data.
Adding 'extended_dev_info_shared_cfg' structure describing the nvram
structure, this is required to access the mbi version string.

Signed-off-by: Sudarsana Reddy Kalluru 
Signed-off-by: Ariel Elior 
---
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c| 24 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h|  5 +
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index a4a90b6c..68aae3e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1105,11 +1105,33 @@ static void bnx2x_get_drvinfo(struct net_device *dev,
  struct ethtool_drvinfo *info)
 {
struct bnx2x *bp = netdev_priv(dev);
+   char version[ETHTOOL_FWVERS_LEN];
+   int ext_dev_info_offset;
+   u32 mbi;
 
strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver));
strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version));
 
-   bnx2x_fill_fw_str(bp, info->fw_version, sizeof(info->fw_version));
+   if (SHMEM2_HAS(bp, extended_dev_info_shared_addr)) {
+   ext_dev_info_offset = SHMEM2_RD(bp,
+   extended_dev_info_shared_addr);
+   mbi = REG_RD(bp, ext_dev_info_offset +
+offsetof(struct extended_dev_info_shared_cfg,
+ mbi_version));
+   if (mbi) {
+   memset(version, 0, sizeof(version));
+   snprintf(version, ETHTOOL_FWVERS_LEN, "mbi %d.%d.%d ",
+(mbi & 0xff00) >> 24,
+(mbi & 0x00ff) >> 16,
+(mbi & 0xff00) >> 8);
+   strlcpy(info->fw_version, version,
+   sizeof(info->fw_version));
+   }
+   }
+
+   memset(version, 0, sizeof(version));
+   bnx2x_fill_fw_str(bp, version, ETHTOOL_FWVERS_LEN);
+   strlcat(info->fw_version, version, sizeof(info->fw_version));
 
strlcpy(info->bus_info, pci_name(bp->pdev), sizeof(info->bus_info));
 }
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
index f8b8103..d9057c8 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
@@ -1140,6 +1140,11 @@ struct shm_dev_info {/* size 
*/
 
 };
 
+struct extended_dev_info_shared_cfg {
+   u32 reserved[18];
+   u32 mbi_version;
+   u32 mbi_date;
+};
 
 #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
#error "Missing either LITTLE_ENDIAN or BIG_ENDIAN definition."
-- 
1.8.3.1



[PATCH net-next] cxgb4: number of VFs supported is not always 16

2018-11-26 Thread Ganesh Goudar
Total number of VFs supported by PF is used to determine the last
byte of VF's mac address. Number of VFs supported is not always
16, use the variable nvfs to get the number of VFs supported
rather than hard coding it to 16.

Signed-off-by: Casey Leedom 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 7f76ad9..6ba9099 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2646,7 +2646,7 @@ static void cxgb4_mgmt_fill_vf_station_mac_addr(struct 
adapter *adap)
 
for (vf = 0, nvfs = pci_sriov_get_totalvfs(adap->pdev);
vf < nvfs; vf++) {
-   macaddr[5] = adap->pf * 16 + vf;
+   macaddr[5] = adap->pf * nvfs + vf;
ether_addr_copy(adap->vfinfo[vf].vf_mac_addr, macaddr);
}
 }
-- 
2.1.0



Re: [PATCH v5 5/6] vxlan: handle underlay VRF changes

2018-11-26 Thread Roopa Prabhu
On Mon, Nov 26, 2018 at 5:04 PM Alexis Bauvin  wrote:
>
> When underlay VRF changes, either because the lower device itself changed,
> or its VRF changed, this patch releases the current socket of the VXLAN
> device and recreates another one in the right VRF. This allows for
> on-the-fly change of the underlay VRF of a VXLAN device.
>
> Signed-off-by: Alexis Bauvin 
> Reviewed-by: Amine Kherbouche 
> Tested-by: Amine Kherbouche 
> ---

re-iterating my comments on the patch this time.

this version still unconditionally calls reopen even if the current
state of the device is closed (eg vxlan_stop).
generally not in favor of the unconditional open/close in the driver.
Lets see if there are other options.
I interpreted one of David's suggestions to force the change ordering
from user-space by returning an error.
ie Make the user do a down and up of the vxlan device if he wants to
change the vrf of the default remote dev.

This patch needs more thought, the rest are ok to go in if you
separate them out.

>  drivers/net/vxlan.c | 82 +
>  1 file changed, 82 insertions(+)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 8ba0a57ff958..131ee80a38f9 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -3720,6 +3720,33 @@ struct net_device *vxlan_dev_create(struct net *net, 
> const char *name,
>  }
>  EXPORT_SYMBOL_GPL(vxlan_dev_create);
>
> +static int vxlan_reopen(struct vxlan_net *vn, struct vxlan_dev *vxlan)
> +{
> +   int ret = 0;
> +
> +   if (vxlan_addr_multicast(>default_dst.remote_ip) &&
> +   !vxlan_group_used(vn, vxlan))
> +   ret = vxlan_igmp_leave(vxlan);
> +   vxlan_sock_release(vxlan);
> +
> +   if (ret < 0)
> +   return ret;
> +
> +   ret = vxlan_sock_add(vxlan);
> +   if (ret < 0)
> +   return ret;
> +
> +   if (vxlan_addr_multicast(>default_dst.remote_ip)) {
> +   ret = vxlan_igmp_join(vxlan);
> +   if (ret == -EADDRINUSE)
> +   ret = 0;
> +   if (ret)
> +   vxlan_sock_release(vxlan);
> +   }
> +
> +   return ret;
> +}
> +
>  static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
>  struct net_device *dev)
>  {
> @@ -3742,6 +3769,55 @@ static void vxlan_handle_lowerdev_unregister(struct 
> vxlan_net *vn,
> unregister_netdevice_many(_kill);
>  }
>
> +static void vxlan_handle_change_upper(struct vxlan_net *vn,
> + struct net_device *dev)
> +{
> +   struct vxlan_dev *vxlan, *next;
> +
> +   list_for_each_entry_safe(vxlan, next, >vxlan_list, next) {
> +   struct net_device *lower;
> +   int err;
> +
> +   lower = __dev_get_by_index(vxlan->net,
> +  vxlan->cfg.remote_ifindex);
> +   if (!netdev_is_upper_master(lower, dev))
> +   continue;
> +
> +   err = vxlan_reopen(vn, vxlan);
> +   if (err < 0)
> +   netdev_err(vxlan->dev, "Failed to reopen socket: 
> %d\n",
> +  err);
> +   }
> +}
> +
> +static void vxlan_handle_change(struct vxlan_net *vn, struct net_device *dev)
> +{
> +   struct vxlan_dev *vxlan = netdev_priv(dev);
> +   struct vxlan_sock *sock;
> +   int l3mdev_index = 0;
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +   bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
> +   bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
> +
> +   sock = ipv6 ? rcu_dereference(vxlan->vn6_sock)
> +   : rcu_dereference(vxlan->vn4_sock);
> +#else
> +   sock = rcu_dereference(vxlan->vn4_sock);
> +#endif
> +
> +   if (vxlan->cfg.remote_ifindex)
> +   l3mdev_index = l3mdev_master_upper_ifindex_by_index(
> +   vxlan->net, vxlan->cfg.remote_ifindex);
> +   if (sock->sock->sk->sk_bound_dev_if != l3mdev_index) {
> +   int ret = vxlan_reopen(vn, vxlan);
> +
> +   if (ret < 0)
> +   netdev_err(vxlan->dev, "Failed to reopen socket: 
> %d\n",
> +  ret);
> +   }
> +}
> +
>  static int vxlan_netdevice_event(struct notifier_block *unused,
>  unsigned long event, void *ptr)
>  {
> @@ -3756,6 +3832,12 @@ static int vxlan_netdevice_event(struct notifier_block 
> *unused,
> } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
>event == NETDEV_UDP_TUNNEL_DROP_INFO) {
> vxlan_offload_rx_ports(dev, event == 
> NETDEV_UDP_TUNNEL_PUSH_INFO);
> +   } else if (event == NETDEV_CHANGEUPPER) {
> +   vxlan_handle_change_upper(vn, dev);
> +   } else if (event == NETDEV_CHANGE) {
> +   if (dev->rtnl_link_ops &&
> +   

[PATCH bpf] sparc: Adjust bpf JIT prologue for PSEUDO calls.

2018-11-26 Thread David Miller


Move all arguments into output registers from input registers.

This path is exercised by test_verifier.c's "calls: two calls with
args" test.  Adjust BPF_TAILCALL_PROLOGUE_SKIP as needed.

Let's also make the prologue length a constant size regardless of
the combination of ->saw_frame_pointer and ->saw_tail_call
settings.

Signed-off-by: David S. Miller 

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 222785a..5fda4f7 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -791,7 +791,7 @@ static int emit_compare_and_branch(const u8 code, const u8 
dst, u8 src,
 }
 
 /* Just skip the save instruction and the ctx register move.  */
-#define BPF_TAILCALL_PROLOGUE_SKIP 16
+#define BPF_TAILCALL_PROLOGUE_SKIP 32
 #define BPF_TAILCALL_CNT_SP_OFF(STACK_BIAS + 128)
 
 static void build_prologue(struct jit_ctx *ctx)
@@ -824,9 +824,15 @@ static void build_prologue(struct jit_ctx *ctx)
const u8 vfp = bpf2sparc[BPF_REG_FP];
 
emit(ADD | IMMED | RS1(FP) | S13(STACK_BIAS) | RD(vfp), ctx);
+   } else {
+   emit_nop(ctx);
}
 
emit_reg_move(I0, O0, ctx);
+   emit_reg_move(I1, O1, ctx);
+   emit_reg_move(I2, O2, ctx);
+   emit_reg_move(I3, O3, ctx);
+   emit_reg_move(I4, O4, ctx);
/* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */
 }
 


[PATCH bpf-next 1/3] bpf: btf: check name validity for various types

2018-11-26 Thread Yonghong Song
Commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC
and BTF_KIND_FUNC_PROTO") checked the name validity
for BTF_KIND_FUNC/BTF_KIND_FUNC_PROTO types such that:
 . BTF_KIND_FUNC must have a valid identifier name
 . BTF_KIND_PROTO must have a null name
 . The argument name of BTF_KIND_FUNC/BTF_KIND_FUNC_PROTO,
   if not null, must be a valid identifier.

This patch added name checking for the following types:
 . BTF_KIND_PTR, BTF_KIND_ARRAY, BTF_KIND_VOLATILE,
   BTF_KIND_CONST, BTF_KIND_RESTRICT:
 the name must be null
 . BTF_KIND_STRUCT, BTF_KIND_UNION: the struct/member name
 is either null or a valid identifier
 . BTF_KIND_ENUM: the enum type name is either null or a valid
 identifier; the enumerator name must be a valid identifier.
 . BTF_KIND_FWD: the name must be a valid identifier
 . BTF_KIND_TYPEDEF: the name must be a valid identifier

For those places a valid name is required, the name must be
a valid C identifier. This can be relaxed later if we found
use cases for a different (non-C) frontend.

Acked-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 kernel/bpf/btf.c | 57 
 1 file changed, 57 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a09b2f94ab25..793acba40b4c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1195,6 +1195,22 @@ static int btf_ref_type_check_meta(struct 
btf_verifier_env *env,
return -EINVAL;
}
 
+   /* typedef type must have a valid name, and other ref types,
+* volatile, const, restrict, should have a null name.
+*/
+   if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
+   if (!t->name_off ||
+   !btf_name_valid_identifier(env->btf, t->name_off)) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+   } else {
+   if (t->name_off) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+   }
+
btf_verifier_log_type(env, t, NULL);
 
return 0;
@@ -1353,6 +1369,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* fwd type must have a valid name */
+   if (!t->name_off ||
+   !btf_name_valid_identifier(env->btf, t->name_off)) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+
btf_verifier_log_type(env, t, NULL);
 
return 0;
@@ -1409,6 +1432,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* array type should not have a name */
+   if (t->name_off) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -1585,6 +1614,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* struct type either no name or a valid one */
+   if (t->name_off &&
+   !btf_name_valid_identifier(env->btf, t->name_off)) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+
btf_verifier_log_type(env, t, NULL);
 
last_offset = 0;
@@ -1596,6 +1632,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* struct member either no name or a valid one */
+   if (member->name_off &&
+   !btf_name_valid_identifier(btf, member->name_off)) {
+   btf_verifier_log_member(env, t, member, "Invalid name");
+   return -EINVAL;
+   }
/* A member cannot be in type void */
if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
btf_verifier_log_member(env, t, member,
@@ -1783,6 +1825,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* enum type either no name or a valid one */
+   if (t->name_off &&
+   !btf_name_valid_identifier(env->btf, t->name_off)) {
+   btf_verifier_log_type(env, t, "Invalid name");
+   return -EINVAL;
+   }
+
btf_verifier_log_type(env, t, NULL);
 
for (i = 0; i < nr_enums; i++) {
@@ -1792,6 +1841,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
 
+   /* enum member must have a valid name */
+   if (!enums[i].name_off ||
+   !btf_name_valid_identifier(btf, enums[i].name_off)) {
+   btf_verifier_log_type(env, t, 

[PATCH bpf-next 2/3] tools/bpf: fix two test_btf unit test cases

2018-11-26 Thread Yonghong Song
From: Martin KaFai Lau 

There are two unit test cases, which should encode
TYPEDEF type, but instead encode PTR type.
The error is flagged out after enforcing name
checking in the previous patch.

Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 tools/testing/selftests/bpf/test_btf.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index bae7308b7ec5..99a92923e3f9 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -447,13 +447,13 @@ static struct btf_raw_test raw_tests[] = {
/* const void *//* [2] */
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
/* const void* */   /* [3] */
-   BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+   BTF_PTR_ENC(2),
/* typedef const void * const_void_ptr */
-   BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
-   /* struct A { *//* [4] */
+   BTF_TYPEDEF_ENC(NAME_TBD, 3),   /* [4] */
+   /* struct A { *//* [5] */
BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 
sizeof(void *)),
/* const_void_ptr m; */
-   BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+   BTF_MEMBER_ENC(NAME_TBD, 4, 0),
/* } */
BTF_END_RAW,
},
@@ -510,11 +510,11 @@ static struct btf_raw_test raw_tests[] = {
/* const void *//* [2] */
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
/* const void* */   /* [3] */
-   BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
-   /* typedef const void * const_void_ptr */   /* [4] */
-   BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
-   /* const_void_ptr[4] */ /* [5] */
-   BTF_TYPE_ARRAY_ENC(3, 1, 4),
+   BTF_PTR_ENC(2),
+   /* typedef const void * const_void_ptr */
+   BTF_TYPEDEF_ENC(NAME_TBD, 3),   /* [4] */
+   /* const_void_ptr[4] */
+   BTF_TYPE_ARRAY_ENC(4, 1, 4),/* [5] */
BTF_END_RAW,
},
.str_sec = "\0const_void_ptr",
-- 
2.17.1



[PATCH bpf-next 3/3] tools/bpf: add additional type tests to test_btf

2018-11-26 Thread Yonghong Song
The following additional unit testcases are added to test_btf:
  BTF raw test[65] (typedef (invalid name, name_off = 0)): OK
  BTF raw test[66] (typedef (invalid name, invalid identifier)): OK
  BTF raw test[67] (ptr type (invalid name, name_off <> 0)): OK
  BTF raw test[68] (volatile type (invalid name, name_off <> 0)): OK
  BTF raw test[69] (const type (invalid name, name_off <> 0)): OK
  BTF raw test[70] (restrict type (invalid name, name_off <> 0)): OK
  BTF raw test[71] (fwd type (invalid name, name_off = 0)): OK
  BTF raw test[72] (fwd type (invalid name, invalid identifier)): OK
  BTF raw test[73] (array type (invalid name, name_off <> 0)): OK
  BTF raw test[74] (struct type (name_off = 0)): OK
  BTF raw test[75] (struct type (invalid name, invalid identifier)): OK
  BTF raw test[76] (struct member (name_off = 0)): OK
  BTF raw test[77] (struct member (invalid name, invalid identifier)): OK
  BTF raw test[78] (enum type (name_off = 0)): OK
  BTF raw test[79] (enum type (invalid name, invalid identifier)): OK
  BTF raw test[80] (enum member (invalid name, name_off = 0)): OK
  BTF raw test[81] (enum member (invalid name, invalid identifier)): OK

Acked-by: Martin KaFai Lau 
Signed-off-by: Yonghong Song 
---
 tools/testing/selftests/bpf/test_btf.c | 362 +
 1 file changed, 362 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_btf.c 
b/tools/testing/selftests/bpf/test_btf.c
index 99a92923e3f9..a7267815311f 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1849,6 +1849,368 @@ static struct btf_raw_test raw_tests[] = {
.err_str = "Invalid type_id",
 },
 
+{
+   .descr = "typedef (invalid name, name_off = 0)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+   BTF_TYPEDEF_ENC(0, 1),  /* [2] */
+   BTF_END_RAW,
+   },
+   .str_sec = "\0__int",
+   .str_sec_size = sizeof("\0__int"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "typedef_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid name",
+},
+
+{
+   .descr = "typedef (invalid name, invalid identifier)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+   BTF_TYPEDEF_ENC(NAME_TBD, 1),   /* [2] */
+   BTF_END_RAW,
+   },
+   .str_sec = "\0__!int",
+   .str_sec_size = sizeof("\0__!int"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "typedef_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid name",
+},
+
+{
+   .descr = "ptr type (invalid name, name_off <> 0)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] 
*/
+   BTF_TYPE_ENC(NAME_TBD,
+BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1),  /* [2] 
*/
+   BTF_END_RAW,
+   },
+   .str_sec = "\0__int",
+   .str_sec_size = sizeof("\0__int"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "ptr_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid name",
+},
+
+{
+   .descr = "volatile type (invalid name, name_off <> 0)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] 
*/
+   BTF_TYPE_ENC(NAME_TBD,
+BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 1), /* [2] 
*/
+   BTF_END_RAW,
+   },
+   .str_sec = "\0__int",
+   .str_sec_size = sizeof("\0__int"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "volatile_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 1,
+   .max_entries = 4,
+   .btf_load_err = true,
+   .err_str = "Invalid name",
+},
+
+{
+   .descr = "const type (invalid name, name_off <> 0)",
+   .raw_types = {
+   BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] 
*/
+   BTF_TYPE_ENC(NAME_TBD,
+BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),/* [2] 
*/
+   BTF_END_RAW,
+   },
+   .str_sec = "\0__int",
+   .str_sec_size = sizeof("\0__int"),
+   .map_type = BPF_MAP_TYPE_ARRAY,
+   .map_name = "const_type_check_btf",
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .key_type_id = 1,
+   .value_type_id = 

[PATCH bpf-next 0/3] bpf: btf: check name validity for various types

2018-11-26 Thread Yonghong Song
Commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC
and BTF_KIND_FUNC_PROTO") checked the name validity
for BTF_KIND_FUNC/FUNC_PROTO types.

This patch added name checking for PTR, ARRAY, VOLATILE, TYPEDEF,
CONST, RESTRICT, STRUCT, UNION, ENUM and FWD types. Such a strict
name checking makes BTF more sound in the kernel and future
BTF-to-header-file conversion ([1]) less fragile.

[1]: http://vger.kernel.org/lpc-bpf2018.html#session-2

Yonghong Song (3):
  bpf: btf: check name validity for various types
  tools/bpf: fix two test_btf unit test cases
  tools/bpf: add additional type tests to test_btf

 kernel/bpf/btf.c   |  57 
 tools/testing/selftests/bpf/test_btf.c | 380 -
 2 files changed, 428 insertions(+), 9 deletions(-)

-- 
2.17.1



Re: [iproute2-next PATCH v4] tc: flower: Classify packets based port ranges

2018-11-26 Thread Stephen Hemminger
On Mon, 26 Nov 2018 17:56:10 -0800
"Nambiar, Amritha"  wrote:

> On 11/26/2018 4:43 PM, David Ahern wrote:
> > On 11/26/18 5:23 PM, Nambiar, Amritha wrote:  
> >>> Can tc flower use something similar to ip ru with single port or port
> >>> range handled like this?
> >>>
> >>> },{
> >>> "priority": 32764,
> >>> "src": "172.16.1.0",
> >>> "srclen": 24,
> >>> "ipproto": "tcp",
> >>> "sport": 1100,
> >>> "table": "main"
> >>> },{
> >>> "priority": 32765,
> >>> "src": "172.16.1.0",
> >>> "srclen": 24,
> >>> "ipproto": "tcp",
> >>> "sport_start": 1000,
> >>> "sport_end": 1010,
> >>> "table": "main"
> >>> },{
> >>>
> >>>  
> >>
> >> Does it have to be separate fields "sport_start" and "sport_end"?
> >> Removing the space and 'range' keyword will make the output format
> >> consistent with the input format and print as "sport " for
> >> single port and "sport -" for range.
> >> Example:
> >>
> >> ... flower ip_proto tcp src_port 12 skip_hw action will print as:
> >>   ip_proto tcp
> >>   src_port 12
> >>   skip_hw
> >>   not_in_hw
> >> action
> >>
> >> ... flower ip_proto tcp src_port 100-200 skip_hw action :
> >>   ip_proto tcp
> >>   src_port 100-200
> >>   skip_hw
> >>   not_in_hw
> >> action  
> > 
> > non-json output needs to match what the user gives on the command line.
> > 
> > My comment was about consistency with json output when possible. I am
> > not a json expert by any means. Other commands have a single key value
> > pair, so I suspect the json here needs to follow suit (ie., not
> > "src_port": "1000-1010" but separate start and end entries).
> >   
> I'm not quite familiar with json. Maybe, Jiri can give feedback here.

JSON support strings and numeric and objects.
The more common JSON way of expressing this would be either as object for sport

{
   "priority": 32765,
   "src": "172.16.1.0",
"srclen": 24,
   "ipproto": "tcp",
"sport" : {
"start" : 1000,
"end" : 1010
},
"table: "main"
}

or as an array:
{
   "priority": 32765,
   "src": "172.16.1.0",
"srclen": 24,
   "ipproto": "tcp",
"sport" : [ 1000, 1010 ],
"table: "main"
}

My point is don't build some semantic meaning directly into the tag part of the 
syntax.



Re: [PATCH bpf-next v2 0/4] libbpf: ABI versioning and documentation

2018-11-26 Thread Alexei Starovoitov
On Fri, Nov 23, 2018 at 04:44:31PM -0800, Andrey Ignatov wrote:
> This patch set adds ABI versioning and documentation to libbpf.
> 
> Patch 1 renames btf_get_from_id to btf__get_from_id to follow naming
> convention.
> Patch 2 adds version script and has more details on ABI versioning.
> Patch 3 adds simple check that all global symbols are versioned.
> Patch 4 documents a few aspects of libbpf API and ABI in dev process.
> 
> v1->v2:
> * add patch from Martin KaFai Lau  to rename btf_get_from_id;
> * add documentation for libbpf API and ABI.

All looks great to me.
Thank you for adding the doc.
Applied to bpf-next.

We need to discuss the release model and version bumps.
For example I don't think it's necessary to bump the version
and update libbpf.map every time the new function is added.
I mean we can synchronize release of libbpf with the release of the kernel
or release it every N weeks.
So if we add new api functions during this release we can simply
add them to libbpf.map while keeping the version as 0.0.1

I'd also consider the first 0.0.1 release to be experimental
while we're figuring out the right process.
For the next kernel/libbpf release I propose to bump it to 1.0.0

Another idea is to version it just like kernel and make libbpf version
always equal kernel version.
But I think that would be an overkill. libbpf isn't tightly coupled to
the kernel. Like we just merged the patch (prog_name/map_name probing
that allows new libbpf to work with older kernel.



Re: [PATCH bpf] bpf, doc: add entries of who looks over which jits

2018-11-26 Thread Alexei Starovoitov
On Tue, Nov 27, 2018 at 01:21:00AM +0100, Daniel Borkmann wrote:
> Make the high-level BPF JIT entry a general 'catch-all' and add
> architecture specific entries to make it more clear who actively
> maintains which BPF JIT compiler. The list (L) address implies
> that this eventually lands in the bpf patchwork bucket. Goal is
> that this set of responsible developers listed here is always up
> to date and a point of contact for helping out in e.g. feature
> development, fixes, review or testing patches in order to help
> long-term in ensuring quality of the BPF JITs and therefore BPF
> core under a given architecture. Every new JIT in future /must/
> have an entry here as well.
> 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 
> Acked-by: Naveen N. Rao 
> Acked-by: Sandipan Das 
> Acked-by: Martin Schwidefsky 
> Acked-by: Heiko Carstens 
> Acked-by: David S. Miller 
> Acked-by: Zi Shen Lim 
> Acked-by: Paul Burton 
> Acked-by: Jakub Kicinski 
> Acked-by: Wang YanQing 

Applied to bpf tree.



Re: [PATCH bpf-next 0/2] bpf: support proper non-jit func info

2018-11-26 Thread Alexei Starovoitov
On Sat, Nov 24, 2018 at 11:20:43PM -0800, Yonghong Song wrote:
> Commit 838e96904ff3 ("bpf: Introduce bpf_func_info")
> added bpf func info support. The userspace is able
> to get better ksym's for bpf programs with jit, and
> is able to print out func prototypes.
> 
> For a program containing func-to-func calls, the existing
> implementation returns user specified number of function
> calls and BTF types if jit is enabled. If the jit is not
> enabled, it only returns the type for the main function.
> 
> This is undesirable. Interpreter may still be used
> and we should keep feature identical regardless of
> whether jit is enabled or not.
> This patch fixed this discrepancy.
> 
> The following example shows bpftool output for
> the bpf program in selftests test_btf_haskv.o when jit
> is disabled:
>   $ bpftool prog dump xlated id 1490
>   int _dummy_tracepoint(struct dummy_tracepoint_args * arg):
>  0: (85) call pc+2#__bpf_prog_run_args32
>  1: (b7) r0 = 0
>  2: (95) exit
>   int test_long_fname_1(struct dummy_tracepoint_args * arg):
>  3: (85) call pc+1#__bpf_prog_run_args32
>  4: (95) exit
>   int test_long_fname_2(struct dummy_tracepoint_args * arg):
>  5: (b7) r2 = 0
>  6: (63) *(u32 *)(r10 -4) = r2
>  7: (79) r1 = *(u64 *)(r1 +8)
>  8: (15) if r1 == 0x0 goto pc+9
>  9: (bf) r2 = r10
> 10: (07) r2 += -4
> 11: (18) r1 = map[id:1173]
> 13: (85) call bpf_map_lookup_elem#77088
> 14: (15) if r0 == 0x0 goto pc+3
> 15: (61) r1 = *(u32 *)(r0 +4)
> 16: (07) r1 += 1
> 17: (63) *(u32 *)(r0 +4) = r1
> 18: (95) exit
>   $ bpftool prog dump jited id 1490
> no instructions returned

Applied to bpf-next. Thanks



Re: [iproute2-next PATCH v4] tc: flower: Classify packets based port ranges

2018-11-26 Thread Nambiar, Amritha
On 11/26/2018 4:43 PM, David Ahern wrote:
> On 11/26/18 5:23 PM, Nambiar, Amritha wrote:
>>> Can tc flower use something similar to ip ru with single port or port
>>> range handled like this?
>>>
>>> },{
>>> "priority": 32764,
>>> "src": "172.16.1.0",
>>> "srclen": 24,
>>> "ipproto": "tcp",
>>> "sport": 1100,
>>> "table": "main"
>>> },{
>>> "priority": 32765,
>>> "src": "172.16.1.0",
>>> "srclen": 24,
>>> "ipproto": "tcp",
>>> "sport_start": 1000,
>>> "sport_end": 1010,
>>> "table": "main"
>>> },{
>>>
>>>
>>
>> Does it have to be separate fields "sport_start" and "sport_end"?
>> Removing the space and 'range' keyword will make the output format
>> consistent with the input format and print as "sport " for
>> single port and "sport -" for range.
>> Example:
>>
>> ... flower ip_proto tcp src_port 12 skip_hw action will print as:
>>   ip_proto tcp
>>   src_port 12
>>   skip_hw
>>   not_in_hw
>> action
>>
>> ... flower ip_proto tcp src_port 100-200 skip_hw action :
>>   ip_proto tcp
>>   src_port 100-200
>>   skip_hw
>>   not_in_hw
>> action
> 
> non-json output needs to match what the user gives on the command line.
> 
> My comment was about consistency with json output when possible. I am
> not a json expert by any means. Other commands have a single key value
> pair, so I suspect the json here needs to follow suit (ie., not
> "src_port": "1000-1010" but separate start and end entries).
> 
I'm not quite familiar with json. Maybe, Jiri can give feedback here.


Re: [PATCH bpf] sparc: Correct ctx->saw_frame_pointer logic.

2018-11-26 Thread Alexei Starovoitov
On Mon, Nov 26, 2018 at 02:52:18PM -0800, David Miller wrote:
> 
> We need to initialize the frame pointer register not just if it is
> seen as a source operand, but also if it is seen as the destination
> operand of a store or an atomic instruction (which effectively is a
> source operand).
> 
> This is exercised by test_verifier's "non-invalid fp arithmetic"
> 
> Signed-off-by: David S. Miller 

Applied to bpf tree. Thanks!



Re: [PATCH bpf] sparc: Fix JIT fused branch convergence.

2018-11-26 Thread Alexei Starovoitov
On Mon, Nov 26, 2018 at 01:03:46PM -0800, David Miller wrote:
> 
> On T4 and later sparc64 cpus we can use the fused compare and branch
> instruction.
> 
> However, it can only be used if the branch destination is in the range
> of a signed 10-bit immediate offset.  This amounts to 1024
> instructions forwards or backwards.
> 
> After the commit referenced in the Fixes: tag, the largest possible
> size program seen by the JIT explodes by a significant factor.
> 
> As a result of this, convergence takes many more passes since the
> expanded "BPF_LDX | BPF_MSH | BPF_B" code sequence, for example,
> contains several embedded branch-on-condition instructions.
> 
> On each pass, as suddenly new fused compare and branch instances
> become valid, this makes thousands more in range for the next pass.
> And so on and so forth.
> 
> This is most greatly exemplified by "BPF_MAXINSNS: exec all MSH" which
> takes 35 passes to converge, and shrinks the image by about 64K.
> 
> To decrease the cost of this number of convergence passes, do the
> convergence pass before we have the program image allocated, just like
> other JITs (such as x86) do.
> 
> Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
> Signed-off-by: David S. Miller 

argh. nice catch.
Applied to bpf tree.



Re: [PATCH bpf 0/2] Fix for arm64 jit

2018-11-26 Thread Alexei Starovoitov
On Mon, Nov 26, 2018 at 02:05:37PM +0100, Daniel Borkmann wrote:
> This set contains a fix for arm64 BPF JIT. First patch generalizes
> ppc64 way of retrieving subprog into bpf_jit_get_func_addr() as core
> code and uses the same on arm64 in second patch. Tested on both arm64
> and ppc64.
> 
> Thanks!

Applied, Thanks



[PATCH v5 3/6] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Alexis Bauvin
Creating a VXLAN device with its underlay in a non-default VRF makes
egress route lookup fail or be incorrect, since it will resolve in the
default VRF, and makes ingress fail because the socket listens in the
default VRF.

This patch binds the underlying UDP tunnel socket to the l3mdev of the
lower device of the VXLAN device. This will listen in the proper VRF and
output traffic from said l3mdev, matching l3mdev routing rules and
looking up the correct routing table.

When the VXLAN device does not have a lower device, or the lower device
is in the default VRF, the socket will not be bound to any interface,
keeping the previous behaviour.

The underlay l3mdev is deduced from the VXLAN lower device
(IFLA_VXLAN_LINK).

+--+ +-+
|  | | |
| vrf-blue | | vrf-red |
|  | | |
++-+ +++
 ||
 ||
++-+ +++
|  | | |
| br-blue  | | br-red  |
|  | | |
++-+ +---+-+---+
 |   | |
 | +-+ +-+
 | | |
++-++--++   +++
|  |  lower device  |   |   | |
|   eth0   | <- - - - - - - | vxlan-red |   | tap-red | (... more taps)
|  ||   |   | |
+--++---+   +-+

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Reviewed-by: David Ahern 
Tested-by: Amine Kherbouche 
---
 drivers/net/vxlan.c | 32 
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 27bd586b94b0..8ba0a57ff958 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -212,7 +212,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct 
vxlan_fdb *fdb)
  * and enabled unshareable flags.
  */
 static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
- __be16 port, u32 flags)
+ __be16 port, u32 flags, int ifindex)
 {
struct vxlan_sock *vs;
 
@@ -221,7 +221,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, 
sa_family_t family,
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
if (inet_sk(vs->sock->sk)->inet_sport == port &&
vxlan_get_sk_family(vs) == family &&
-   vs->flags == flags)
+   vs->flags == flags &&
+   vs->sock->sk->sk_bound_dev_if == ifindex)
return vs;
}
return NULL;
@@ -261,7 +262,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, 
int ifindex,
 {
struct vxlan_sock *vs;
 
-   vs = vxlan_find_sock(net, family, port, flags);
+   vs = vxlan_find_sock(net, family, port, flags, ifindex);
if (!vs)
return NULL;
 
@@ -2172,6 +2173,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
struct rtable *rt;
__be16 df = 0;
 
+   if (!ifindex)
+   ifindex = sock4->sock->sk->sk_bound_dev_if;
+
rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
 dst->sin.sin_addr.s_addr,
 _ip.sin.sin_addr.s_addr,
@@ -2210,6 +2214,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
} else {
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
 
+   if (!ifindex)
+   ifindex = sock6->sock->sk->sk_bound_dev_if;
+
ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
label, >sin6.sin6_addr,
_ip.sin6.sin6_addr,
@@ -2813,7 +2820,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
 };
 
 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
-   __be16 port, u32 flags)
+   __be16 port, u32 flags, int ifindex)
 {
struct socket *sock;
struct udp_port_cfg udp_conf;
@@ -2831,6 +2838,7 @@ static struct socket *vxlan_create_sock(struct net *net, 
bool ipv6,
}
 
udp_conf.local_udp_port = port;
+   udp_conf.bind_ifindex = ifindex;
 
/* Open UDP socket */
err = udp_sock_create(net, _conf, );
@@ -2842,7 +2850,8 @@ static struct socket *vxlan_create_sock(struct net *net, 
bool ipv6,
 
 /* Create new 

[PATCH v5 2/6] l3mdev: add function to retrieve upper master

2018-11-26 Thread Alexis Bauvin
Existing functions to retrieve the l3mdev of a device did not walk the
master chain to find the upper master. This patch adds a function to
find the l3mdev, even indirectly through e.g. a bridge:

+--+
|  |
| vrf-blue |
|  |
++-+
 |
 |
++-+
|  |
| br-blue  |
|  |
++-+
 |
 |
++-+
|  |
|   eth0   |
|  |
+--+

This will properly resolve the l3mdev of eth0 to vrf-blue.

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Reviewed-by: David Ahern 
Tested-by: Amine Kherbouche 
---
 include/net/l3mdev.h | 22 ++
 net/l3mdev/l3mdev.c  | 18 ++
 2 files changed, 40 insertions(+)

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 3832099289c5..78fa0ac4613c 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct 
net_device *_dev)
return master;
 }
 
+int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
+static inline
+int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
+{
+   rcu_read_lock();
+   ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
+   rcu_read_unlock();
+
+   return ifindex;
+}
+
 u32 l3mdev_fib_table_rcu(const struct net_device *dev);
 u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
 static inline u32 l3mdev_fib_table(const struct net_device *dev)
@@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct 
net *net, int ifindex)
return 0;
 }
 
+static inline
+int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
+{
+   return 0;
+}
+static inline
+int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
+{
+   return 0;
+}
+
 static inline
 struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
 {
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8da86ceca33d..309dee76724e 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
 
+/**
+ * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master
+ *device
+ * @net: network namespace for device index lookup
+ * @ifindex: targeted interface
+ */
+int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
+{
+   struct net_device *dev;
+
+   dev = dev_get_by_index_rcu(net, ifindex);
+   while (dev && !netif_is_l3_master(dev))
+   dev = netdev_master_upper_dev_get(dev);
+
+   return dev ? dev->ifindex : 0;
+}
+EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
+
 /**
  * l3mdev_fib_table - get FIB table id associated with an L3
  * master interface
-- 



[PATCH v5 0/6] Add VRF support for VXLAN underlay

2018-11-26 Thread Alexis Bauvin
v4 -> v5:
- move test script to its own patch (6/6)
- add schematic for test script
- apply David Ahern comments to the test script

v3 -> v4:
- rename vxlan_is_in_l3mdev_chain to netdev_is_upper_master
- move it to net/core/dev.c
- make it return bool instead of int
- check if remote_ifindex is zero before resolving the l3mdev
- add testing script

v2 -> v3:
- fix build when CONFIG_NET_IPV6 is off
- fix build "unused l3mdev_master_upper_ifindex_by_index" build error with some
  configs

v1 -> v2:
- move vxlan_get_l3mdev from vxlan driver to l3mdev driver as
  l3mdev_master_upper_ifindex_by_index
- vxlan: rename variables named l3mdev_ifindex to ifindex

v0 -> v1:
- fix typos

We are trying to isolate the VXLAN traffic from different VMs with VRF as shown
in the schemas below:

+-+   ++
| +--+|   | ++ |
| |  ||   | || |
| | tap-red  ||   | |  tap-blue  | |
| |  ||   | || |
| ++-+|   | +-+--+ |
|  |  |   |   ||
|  |  |   |   ||
| ++---+  |   |  +++   |
| ||  |   |  | |   |
| | br-red |  |   |  | br-blue |   |
| ||  |   |  | |   |
| ++---+  |   |  +++   |
|  |  |   |   ||
|  |  |   |   ||
|  |  |   |   ||
| +++ |   | +--+   |
| | | |   | |  |   |
| |  vxlan-red  | |   | |  vxlan-blue  |   |
| | | |   | |  |   |
| +--+--+ |   | +---+--+   |
|||   | |  |
||   VRF  |   | |  VRF |
||   red  |   | | blue |
+-+   ++
 |  |
 |  |
 +-+
 |   |  |  |
 |   |  |  |
 |   | +--+ |  |
 |   | |  | |  |
 |   +-+  eth0.2030   +-+  |
 | |  10.0.0.1/24 ||
 | +-++VRF |
 |   |green|
 +-+
 |
 |
++---+
||
|  eth0  |
||
++


iproute2 commands to reproduce the setup:

ip link add green type vrf table 1
ip link set green up
ip link add eth0.2030 link eth0 type vlan id 2030
ip link set eth0.2030 master green
ip addr add 10.0.0.1/24 dev eth0.2030
ip link set eth0.2030 up

ip link add blue type vrf table 2
ip link set blue up
ip link add br-blue type bridge
ip link set br-blue master blue
ip link set br-blue up
ip link add vxlan-blue type vxlan id 2 local 10.0.0.1 dev eth0.2030 \
 port 4789
ip link set vxlan-blue master br-blue
ip link set vxlan-blue up
ip link set tap-blue master br-blue
ip link set tap-blue up

ip link add red type vrf table 3
ip link set red up
ip link add br-red type bridge
ip link set br-red master red
ip link set br-red up
ip link add vxlan-red type vxlan id 3 local 10.0.0.1 dev eth0.2030 \
 port 4789
ip link set vxlan-red master br-red
ip link set vxlan-red up
ip link set tap-red master br-red
ip link set tap-red up

We faced some issue in the datapath, here are the details:

* Egress traffic:
The VXLAN packets are sent directly to the default VRF because that is where
the socket is bound; therefore the traffic has a default route via eth0. The
workaround is to force this traffic to VRF green with ip rules.

* Ingress traffic:
When receiving the traffic on eth0.2030 the vxlan socket is unreachable from
VRF green. The workaround is to enable *udp_l3mdev_accept* sysctl, but
this breaks isolation between overlay and underlay: packets sent from
blue or red by e.g. a guest VM will be accepted by the socket, allowing
injection of VXLAN packets from the overlay.

This patch series fixes the issues described above by allowing the VXLAN socket
to be bound to a specific VRF device, therefore looking up routes in the correct
table.

Alexis Bauvin (6):
  udp_tunnel: add 

[PATCH v5 6/6] test/net: Add script for VXLAN underlay in a VRF

2018-11-26 Thread Alexis Bauvin
This script tests the support of a VXLAN underlay in a non-default VRF.

It does so by simulating two hypervisors and two VMs, an extended L2
between the VMs with the hypervisors as VTEPs with the underlay in a
VRF, and finally by pinging the two VMs.

It also tests that moving the underlay from a VRF to another works when
down/up the VXLAN interface.

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Tested-by: Amine Kherbouche 
---
 tools/testing/selftests/net/Makefile  |   1 +
 .../selftests/net/test_vxlan_under_vrf.sh | 129 ++
 2 files changed, 130 insertions(+)
 create mode 100755 tools/testing/selftests/net/test_vxlan_under_vrf.sh

diff --git a/tools/testing/selftests/net/Makefile 
b/tools/testing/selftests/net/Makefile
index 919aa2ac00af..d45295ca56ce 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -7,6 +7,7 @@ CFLAGS += -I../../../../usr/include/
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh 
rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
+TEST_PROGS += test_vxlan_under_vrf.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh 
b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
new file mode 100755
index ..09f9ed92cbe4
--- /dev/null
+++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking VXLAN underlay in a non-default VRF.
+#
+# It simulates two hypervisors running a VM each using four network namespaces:
+# two for the HVs, two for the VMs.
+# A small VXLAN tunnel is made between the two hypervisors to have the two vms
+# in the same virtual L2:
+#
+# +---+
+---+
+# |   ||   
|
+# |vm-1 netns ||vm-2 netns 
|
+# |   ||   
|
+# |  +-+  ||  +-+  
|
+# |  |   veth-hv   |  ||  |   veth-hv   |  
|
+# |  | 10.0.0.1/24 |  ||  | 10.0.0.2/24 |  
|
+# |  +-+  ||  +-+  
|
+# |.  || . 
|
+# +---+
+---+
+#  . .
+#  . .
+#  . .
+# +---+   
++
+# |.  |   |  . 
|
+# |  +--+ |   | +--+   
|
+# |  | veth-tap | |   | | veth-tap |   
|
+# |  ++-+ |   | ++-+   
|
+# |   |   |   |  | 
|
+# |+--+--+  +--+  |   |  +--+ +--+--+  
|
+# || br0 |  | vrf-underlay |  |   |  | vrf-underlay | | br0 |  
|
+# |+--+--+  +---+--+  |   |  +--+---+ +--+--+  
|
+# |   | | |   | || 
|
+# |   +---+++---+---+ |   | +---+---++---++
|
+# |   | vxlan0 || veth0 |.|...|.| veth0 || vxlan0 |
|
+# |   ++| 172.16.0.1/24 | |   | | 172.16.0.2/24 |++
|
+# | +---+ |   | +---+  
|
+# |   |   |
|
+# | hv-1 netns|   |   hv-2 netns   
|
+# |   |   |
|
+# +---+   
++
+#
+# This tests both the connectivity between vm-1 and vm-2, and that the underlay
+# can be moved in and out of the vrf by unsetting and setting veth0's master.
+
+set -e
+
+cleanup() {
+ip link del veth-hv-1 2>/dev/null || true
+ip link del veth-tap 2>/dev/null || true
+
+for ns in hv-1 hv-2 vm-1 vm-2; do
+ip netns del $ns || true
+done
+}
+
+# Clean start
+cleanup &> /dev/null
+
+[[ $1 == "clean" ]] && exit 0
+
+trap cleanup EXIT
+
+# Setup "Hypervisors" simulated 

[PATCH v5 5/6] vxlan: handle underlay VRF changes

2018-11-26 Thread Alexis Bauvin
When underlay VRF changes, either because the lower device itself changed,
or its VRF changed, this patch releases the current socket of the VXLAN
device and recreates another one in the right VRF. This allows for
on-the-fly change of the underlay VRF of a VXLAN device.

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Tested-by: Amine Kherbouche 
---
 drivers/net/vxlan.c | 82 +
 1 file changed, 82 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8ba0a57ff958..131ee80a38f9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3720,6 +3720,33 @@ struct net_device *vxlan_dev_create(struct net *net, 
const char *name,
 }
 EXPORT_SYMBOL_GPL(vxlan_dev_create);
 
+static int vxlan_reopen(struct vxlan_net *vn, struct vxlan_dev *vxlan)
+{
+   int ret = 0;
+
+   if (vxlan_addr_multicast(>default_dst.remote_ip) &&
+   !vxlan_group_used(vn, vxlan))
+   ret = vxlan_igmp_leave(vxlan);
+   vxlan_sock_release(vxlan);
+
+   if (ret < 0)
+   return ret;
+
+   ret = vxlan_sock_add(vxlan);
+   if (ret < 0)
+   return ret;
+
+   if (vxlan_addr_multicast(>default_dst.remote_ip)) {
+   ret = vxlan_igmp_join(vxlan);
+   if (ret == -EADDRINUSE)
+   ret = 0;
+   if (ret)
+   vxlan_sock_release(vxlan);
+   }
+
+   return ret;
+}
+
 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
 struct net_device *dev)
 {
@@ -3742,6 +3769,55 @@ static void vxlan_handle_lowerdev_unregister(struct 
vxlan_net *vn,
unregister_netdevice_many(_kill);
 }
 
+static void vxlan_handle_change_upper(struct vxlan_net *vn,
+ struct net_device *dev)
+{
+   struct vxlan_dev *vxlan, *next;
+
+   list_for_each_entry_safe(vxlan, next, >vxlan_list, next) {
+   struct net_device *lower;
+   int err;
+
+   lower = __dev_get_by_index(vxlan->net,
+  vxlan->cfg.remote_ifindex);
+   if (!netdev_is_upper_master(lower, dev))
+   continue;
+
+   err = vxlan_reopen(vn, vxlan);
+   if (err < 0)
+   netdev_err(vxlan->dev, "Failed to reopen socket: %d\n",
+  err);
+   }
+}
+
+static void vxlan_handle_change(struct vxlan_net *vn, struct net_device *dev)
+{
+   struct vxlan_dev *vxlan = netdev_priv(dev);
+   struct vxlan_sock *sock;
+   int l3mdev_index = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+   bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
+   bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
+
+   sock = ipv6 ? rcu_dereference(vxlan->vn6_sock)
+   : rcu_dereference(vxlan->vn4_sock);
+#else
+   sock = rcu_dereference(vxlan->vn4_sock);
+#endif
+
+   if (vxlan->cfg.remote_ifindex)
+   l3mdev_index = l3mdev_master_upper_ifindex_by_index(
+   vxlan->net, vxlan->cfg.remote_ifindex);
+   if (sock->sock->sk->sk_bound_dev_if != l3mdev_index) {
+   int ret = vxlan_reopen(vn, vxlan);
+
+   if (ret < 0)
+   netdev_err(vxlan->dev, "Failed to reopen socket: %d\n",
+  ret);
+   }
+}
+
 static int vxlan_netdevice_event(struct notifier_block *unused,
 unsigned long event, void *ptr)
 {
@@ -3756,6 +3832,12 @@ static int vxlan_netdevice_event(struct notifier_block 
*unused,
} else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
   event == NETDEV_UDP_TUNNEL_DROP_INFO) {
vxlan_offload_rx_ports(dev, event == 
NETDEV_UDP_TUNNEL_PUSH_INFO);
+   } else if (event == NETDEV_CHANGEUPPER) {
+   vxlan_handle_change_upper(vn, dev);
+   } else if (event == NETDEV_CHANGE) {
+   if (dev->rtnl_link_ops &&
+   !strcmp(dev->rtnl_link_ops->kind, vxlan_link_ops.kind))
+   vxlan_handle_change(vn, dev);
}
 
return NOTIFY_DONE;
-- 



[PATCH v5 4/6] netdev: add netdev_is_upper_master

2018-11-26 Thread Alexis Bauvin
In preparation of next patch, this function allows to check if a device
is a master, be it direct or indirect, of another one. It walks up the
master chain until it finds the device, or there is no more master.

This allows to check e.g. if br-blue is a master of eth0:

   +--+
   | vrf-blue |
   ++-+
|
   +++
   | br-blue |
   +++
|
+---+---+
| bond0 |
+--+-+--+
   | |
+--+ +--+
|   |
+---+--+ +--+---+
| eth0 | | eth1 |
+--+ +--+

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Tested-by: Amine Kherbouche 
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c| 17 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d837dad24b4c..102f79337d7c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4212,6 +4212,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
   struct net_device *lower_dev);
 void netdev_lower_state_changed(struct net_device *lower_dev,
void *lower_state_info);
+bool netdev_is_upper_master(struct net_device *dev, struct net_device *master);
 
 /* RSS keys are 40 or 52 bytes long */
 #define NETDEV_RSS_KEY_LEN 52
diff --git a/net/core/dev.c b/net/core/dev.c
index 93243479085f..9222434a5332 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7225,6 +7225,23 @@ void netdev_lower_state_changed(struct net_device 
*lower_dev,
 }
 EXPORT_SYMBOL(netdev_lower_state_changed);
 
+/**
+ * netdev_is_upper_master - Test if a device is a master, direct or indirect,
+ *  of another one.
+ * @dev: device to start looking from
+ * @master: device to test if master of dev
+ */
+bool netdev_is_upper_master(struct net_device *dev, struct net_device *master)
+{
+   if (!dev)
+   return false;
+
+   if (dev == master)
+   return true;
+   return netdev_is_upper_master(netdev_master_upper_dev_get(dev), master);
+}
+EXPORT_SYMBOL(netdev_is_upper_master);
+
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
const struct net_device_ops *ops = dev->netdev_ops;
-- 



[PATCH v5 1/6] udp_tunnel: add config option to bind to a device

2018-11-26 Thread Alexis Bauvin
UDP tunnel sockets are always opened unbound to a specific device. This
patch allows the socket to be bound to a custom device, which
incidentally makes UDP tunnels VRF-aware when binding to an l3mdev.

Signed-off-by: Alexis Bauvin 
Reviewed-by: Amine Kherbouche 
Reviewed-by: David Ahern 
Tested-by: Amine Kherbouche 
---
 include/net/udp_tunnel.h  |  1 +
 net/ipv4/udp_tunnel.c | 10 ++
 net/ipv6/ip6_udp_tunnel.c |  9 +
 3 files changed, 20 insertions(+)

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..9f7970d010f9 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -30,6 +30,7 @@ struct udp_port_cfg {
 
__be16  local_udp_port;
__be16  peer_udp_port;
+   int bind_ifindex;
unsigned intuse_udp_checksums:1,
use_udp6_tx_checksums:1,
use_udp6_rx_checksums:1,
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..dc68e15a4f72 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -20,6 +20,16 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg 
*cfg,
if (err < 0)
goto error;
 
+   if (cfg->bind_ifindex) {
+   struct net_device *dev;
+
+   dev = __dev_get_by_index(net, cfg->bind_ifindex);
+   err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+   dev->name, strlen(dev->name) + 1);
+   if (err < 0)
+   goto error;
+   }
+
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b283f293ee4a..fc3811ef8787 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -31,6 +31,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg 
*cfg,
if (err < 0)
goto error;
}
+   if (cfg->bind_ifindex) {
+   struct net_device *dev;
+
+   dev = __dev_get_by_index(net, cfg->bind_ifindex);
+   err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+   dev->name, strlen(dev->name) + 1);
+   if (err < 0)
+   goto error;
+   }
 
udp6_addr.sin6_family = AF_INET6;
memcpy(_addr.sin6_addr, >local_ip6,
-- 



Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Alexis Bauvin
Le 27 nov. 2018 à 01:46, David Ahern  a écrit :
> On 11/26/18 5:41 PM, Alexis Bauvin wrote:
>> Le 26 nov. 2018 à 18:54, David Ahern  a écrit :
>>> On 11/26/18 9:32 AM, Alexis Bauvin wrote:
 Thanks for the review. I’ll send a v5 if you have no other comment on
 this version!
>>> 
>>> A few comments on the test script; see attached which has the changes.
>>> 
>>> Mainly the cleanup does not need to be called at the end since you setup
>>> the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
>>> those are moved to other namespaces.
>> 
>> This was on purpose to be sure to cleanup the interfaces in case the
>> script crashes for some reason and left interfaces outside of the
>> namespace.
> 
> ok.
> 
>>> It would be good to copy the topology ascii art into the test script as
>>> well for future users.
>> 
>> Will include this:
> 
> 
> 
> Thanks for adding.
> 
>> 
>>> Also, add the test as a separate patch at the end and include it in
>>> tools/testing/selftests/net/Makefile
>> 
>> Regarding the discussion on patch 5, it should be better to send it first
>> after patch 3, and remove the down/up from it after current patch 5,
>> right?
> 
> Typically the test case is added at the end verifying the end goal of
> the patch set as opposed to being part of a patch (3 in your case) and
> then amended by a later patch.

Ok. Will make the test script the very last of the series. Thanks for the
pointers.


[iproute2-next PATCH v5] tc: flower: Classify packets based on port ranges

2018-11-26 Thread Amritha Nambiar
Added support for filtering based on port ranges.
UAPI changes have been accepted into net-next.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v5:
Simplified some code and used 'sscanf' for parsing. Removed
space in output format.

v4:
Added man updates explaining filtering based on port ranges.
Removed 'range' keyword.

v3:
Modified flower_port_range_attr_type calls.

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   13 +---
 tc/f_flower.c|   78 --
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..adff41e 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,9 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the 
\fBprotocol\fR
 option to tc filter, optionally followed by a slash and the prefix length.
 If the prefix is missing, \fBtc\fR assumes a full-length host match.
 .TP
-.BI dst_port " NUMBER"
+.IR \fBdst_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
 .TQ
-.BI src_port " NUMBER"
-Match on layer 4 protocol source or destination port number. Only available for
+.IR \fBsrc_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
+Match on layer 4 protocol source or destination port number. Alternatively, the
+mininum and maximum values can be specified to match on a range of layer 4
+protocol source or destination port numbers. Only available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..9a01b4a 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -473,24 +473,57 @@ static int flower_port_attr_type(__u8 ip_proto, enum 
flower_endpoint endpoint)
return -1;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+   return 0;
+}
+
 static int flower_parse_port(char *str, __u8 ip_proto,
 enum flower_endpoint endpoint,
 struct nlmsghdr *n)
 {
+   __u16 min, max;
int ret;
-   int type;
-   __be16 port;
 
-   type = flower_port_attr_type(ip_proto, endpoint);
-   if (type < 0)
-   return -1;
+   ret = sscanf(str, "%hu-%hu", , );
 
-   ret = get_be16(, str, 10);
-   if (ret)
-   return -1;
+   if (ret == 1) {
+   int type;
 
-   addattr16(n, MAX_MSG, type, port);
+   type = flower_port_attr_type(ip_proto, endpoint);
+   if (type < 0)
+   return -1;
+   addattr16(n, 

Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread David Ahern
On 11/26/18 5:41 PM, Alexis Bauvin wrote:
> Le 26 nov. 2018 à 18:54, David Ahern  a écrit :
>> On 11/26/18 9:32 AM, Alexis Bauvin wrote:
>>> Thanks for the review. I’ll send a v5 if you have no other comment on
>>> this version!
>>
>> A few comments on the test script; see attached which has the changes.
>>
>> Mainly the cleanup does not need to be called at the end since you setup
>> the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
>> those are moved to other namespaces.
> 
> This was on purpose to be sure to cleanup the interfaces in case the
> script crashes for some reason and left interfaces outside of the
> namespace.

ok.

>> It would be good to copy the topology ascii art into the test script as
>> well for future users.
> 
> Will include this:



Thanks for adding.

> 
>> Also, add the test as a separate patch at the end and include it in
>> tools/testing/selftests/net/Makefile
> 
> Regarding the discussion on patch 5, it should be better to send it first
> after patch 3, and remove the down/up from it after current patch 5,
> right?

Typically the test case is added at the end verifying the end goal of
the patch set as opposed to being part of a patch (3 in your case) and
then amended by a later patch.


Re: [iproute2-next PATCH v4] tc: flower: Classify packets based port ranges

2018-11-26 Thread David Ahern
On 11/26/18 5:23 PM, Nambiar, Amritha wrote:
>> Can tc flower use something similar to ip ru with single port or port
>> range handled like this?
>>
>> },{
>> "priority": 32764,
>> "src": "172.16.1.0",
>> "srclen": 24,
>> "ipproto": "tcp",
>> "sport": 1100,
>> "table": "main"
>> },{
>> "priority": 32765,
>> "src": "172.16.1.0",
>> "srclen": 24,
>> "ipproto": "tcp",
>> "sport_start": 1000,
>> "sport_end": 1010,
>> "table": "main"
>> },{
>>
>>
> 
> Does it have to be separate fields "sport_start" and "sport_end"?
> Removing the space and 'range' keyword will make the output format
> consistent with the input format and print as "sport " for
> single port and "sport -" for range.
> Example:
> 
> ... flower ip_proto tcp src_port 12 skip_hw action will print as:
>   ip_proto tcp
>   src_port 12
>   skip_hw
>   not_in_hw
> action
> 
> ... flower ip_proto tcp src_port 100-200 skip_hw action :
>   ip_proto tcp
>   src_port 100-200
>   skip_hw
>   not_in_hw
> action

non-json output needs to match what the user gives on the command line.

My comment was about consistency with json output when possible. I am
not a json expert by any means. Other commands have a single key value
pair, so I suspect the json here needs to follow suit (ie., not
"src_port": "1000-1010" but separate start and end entries).


Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Alexis Bauvin
Le 26 nov. 2018 à 18:54, David Ahern  a écrit :
> On 11/26/18 9:32 AM, Alexis Bauvin wrote:
>> Thanks for the review. I’ll send a v5 if you have no other comment on
>> this version!
> 
> A few comments on the test script; see attached which has the changes.
> 
> Mainly the cleanup does not need to be called at the end since you setup
> the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
> those are moved to other namespaces.

This was on purpose to be sure to cleanup the interfaces in case the
script crashes for some reason and left interfaces outside of the
namespace.

> 'ip netns exec NAME ip ...' is more
> efficiently done as 'ip -netns NAME ...'. The test results should align
> like this:
> 
> Checking HV connectivity  [ OK ]
> Check VM connectivity through VXLAN (underlay in the default VRF) [ OK ]
> Check VM connectivity through VXLAN (underlay in a VRF)   [ OK ]
> 
> So it is easy for users to see the PASS/FAIL.

Awesome, thanks!

> It would be good to copy the topology ascii art into the test script as
> well for future users.

Will include this:

+---+  +---+
|   |  |   |
|vm-1 netns |  |vm-2 netns |
|   |  |   |
|  +-+  |  |  +-+  |
|  |   veth-hv   |  |  |  |   veth-hv   |  |
|  | 10.0.0.1/24 |  |  |  | 10.0.0.2/24 |  |
|  +-+  |  |  +-+  |
| . |  | . |
+---+  +---+
  .  .
  .  .
  .  .
++   +-+
| .  |   |   . |
|   +--+ |   | +--+|
|   | veth-tap | |   | | veth-tap ||
|   ++-+ |   | ++-+|
||   |   |  |  |
| +--+--+  +--+  |   |  +--+ +--+--+   |
| | br0 |  | vrf-underlay |  |   |  | vrf-underlay | | br0 |   |
| +--+--+  +---+--+  |   |  +--+---+ +--+--+   |
|| | |   | ||  |
|+---+++---+---+ |   | +---+---++---++ |
|| vxlan0 || veth0 |.|...|.| veth0 || vxlan0 | |
|++| 172.16.0.1/24 | |   | | 172.16.0.2/24 |++ |
|  +---+ |   | +---+   |
||   | |
|  hv-1 netns|   |   hv-2 netns|
||   | |
++   +-+

> Also, add the test as a separate patch at the end and include it in
> tools/testing/selftests/net/Makefile

Regarding the discussion on patch 5, it should be better to send it first
after patch 3, and remove the down/up from it after current patch 5,
right?

> Finally, I think you should drop the RFC and send it as a 'ready for
> inclusion’.

Great thanks!

> 


Re: Can decnet be deprecated?

2018-11-26 Thread Stephen Hemminger
On Sat, 24 Nov 2018 17:12:48 -0700
David Ahern  wrote:

> IPX was moved to staging at the end of last year. Can decnet follow
> suit? git log seems to indicate no active development in a very long time.
> 
> David

I have no problem with dropping decnet support from iproute first.
Presumably anyone still using it is going to be on old distro tools anyway.


Re: [PATCH net] bonding: fix 802.3ad state sent to partner when unbinding slave

2018-11-26 Thread Jay Vosburgh
Toni Peltonen  wrote:

>Previously when unbinding a slave the 802.3ad implementation only told
>partner that the port is not suitable for aggregation by setting the port
>aggregation state from aggregatable to individual. This is not enough. If the
>physical layer still stays up and we only unbinded this port from the bond 
>there
>is nothing in the aggregation status alone to prevent the partner from sending
>traffic towards us. To ensure that the partner doesn't consider this
>port at all anymore we should also disable collecting and distributing to
>signal that this actor is going away.
>
>I have tested this behaviour againts Arista EOS switches with mlx5 cards
>(physical link stays up when even when interface is down) and simulated
>the same situation virtually Linux <-> Linux with two network namespaces
>running two veth device pairs. In both cases setting aggregation to
>individual doesn't alone prevent traffic from being to sent towards this
>port given that the link stays up in partners end. Partner still keeps
>it's end in collecting + distributing state and continues until timeout is
>reached. In most cases this means we are losing the traffic partner sends
>towards our port while we wait for timeout. This is most visible with slow
>periodic time (LAPC rate slow).

"LAPC" -> "LACP"

>Other open source implementations like Open VSwitch and libreswitch, and
>vendor implementations like Arista EOS, seem to disable collecting +
>distributing to when doing similar port disabling/detaching/removing change.
>With this patch kernel implementation would behave the same way and ensure
>partner doesn't consider our actor viable anymore.

After re-reading the relevant bits of 802.1AX (particularly
5.4.9 on recordPDU and update_Selected) I'm going to suggest also
clearing AD_STATE_SYNCHRONIZATION, based on:

Partner_Oper_Port_State.Synchronization is also set to TRUE if
the value of Actor_State.Aggregation in the received PDU is set
to FALSE (i.e., indicates an Individual link),
Actor_State.Synchronization in the received PDU is set to TRUE,
and LACP will actively maintain the link.

Per the above, clearing _SYNC in the LACPDU should un-sync the
port, inducing the Mux state machine (figure 5-15) to exit C_D state and
go to ATTACHED state (disabling Coll/Dist).

But, either way, as this is a hint to get the link partner to
stop using the port, this looks reasonable to me.  

Acked-by: Jay Vosburgh 

-J  

>Signed-off-by: Toni Peltonen 
>---
> drivers/net/bonding/bond_3ad.c | 2 ++
> 1 file changed, 2 insertions(+)
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index f43fb2f958a5..6776c33753dc 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -2086,6 +2086,8 @@ void bond_3ad_unbind_slave(struct slave *slave)
>  aggregator->aggregator_identifier);
> 
>   /* Tell the partner that this port is not suitable for aggregation */
>+  port->actor_oper_port_state &= ~AD_STATE_COLLECTING;
>+  port->actor_oper_port_state &= ~AD_STATE_DISTRIBUTING;
>   port->actor_oper_port_state &= ~AD_STATE_AGGREGATION;
>   __update_lacpdu_from_port(port);
>   ad_lacpdu_send(port);
>-- 
>2.19.0

---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: [iproute2-next PATCH v4] tc: flower: Classify packets based port ranges

2018-11-26 Thread Nambiar, Amritha
On 11/21/2018 1:42 PM, David Ahern wrote:
> On 11/20/18 11:17 PM, Amritha Nambiar wrote:
>> diff --git a/tc/f_flower.c b/tc/f_flower.c
>> index 65fca04..722647d 100644
>> --- a/tc/f_flower.c
>> +++ b/tc/f_flower.c
>> @@ -494,6 +494,68 @@ static int flower_parse_port(char *str, __u8 ip_proto,
>>  return 0;
>>  }
>>  
>> +static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
>> type,
>> +   __be16 *min_port_type,
>> +   __be16 *max_port_type)
>> +{
>> +if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
>> +ip_proto == IPPROTO_SCTP) {
>> +if (type == FLOWER_ENDPOINT_SRC) {
>> +*min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
>> +*max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
>> +} else {
>> +*min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
>> +*max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
>> +}
>> +} else {
>> +return -1;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
> 
> why not just min and max directly since they are not set here but only
> referenced by value. Also, you do not parse anything in this function so
> the helper is misnamed.
> 
> But I think this can be done simpler using what was done in ip/iprule.c ...
> 

Okay, will modify this.

> 
>> +   enum flower_endpoint endpoint,
>> +   struct nlmsghdr *n)
>> +{
>> +__be16 min_port_type, max_port_type;
>> +
>> +if (htons(*max) <= htons(*min)) {
>> +fprintf(stderr, "max value should be greater than min value\n");
>> +return -1;
>> +}
>> +
>> +if (flower_port_range_attr_type(ip_proto, endpoint, _port_type,
>> +_port_type))
>> +return -1;
>> +
>> +addattr16(n, MAX_MSG, min_port_type, *min);
>> +addattr16(n, MAX_MSG, max_port_type, *max);
>> +
>> +return 0;
>> +}
>> +
>> +static int get_range(__be16 *min, __be16 *max, char *argv)
>> +{
>> +char *r;
>> +
>> +r = strchr(argv, '-');
>> +if (r) {
>> +*r = '\0';
>> +if (get_be16(min, argv, 10)) {
>> +fprintf(stderr, "invalid min range\n");
>> +return -1;
>> +}
>> +if (get_be16(max, r + 1, 10)) {
>> +fprintf(stderr, "invalid max range\n");
>> +return -1;
>> +}
>> +} else {
>> +return -1;
>> +}
>> +return 0;
>> +}
>> +
>>  #define TCP_FLAGS_MAX_MASK 0xfff
>>  
>>  static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type,
>> @@ -1061,20 +1123,47 @@ static int flower_parse_opt(struct filter_util *qu, 
>> char *handle,
>>  return -1;
>>  }
>>  } else if (matches(*argv, "dst_port") == 0) {
>> +__be16 min, max;
>> +
>>  NEXT_ARG();
>> -ret = flower_parse_port(*argv, ip_proto,
>> -FLOWER_ENDPOINT_DST, n);
>> -if (ret < 0) {
>> -fprintf(stderr, "Illegal \"dst_port\"\n");
>> -return -1;
>> +
>> +if (!get_range(, , *argv)) {
>> +ret = flower_parse_port_range(, ,
>> +  ip_proto,
>> +  
>> FLOWER_ENDPOINT_DST,
>> +  n);
>> +if (ret < 0) {
>> +fprintf(stderr, "Illegal \"dst_port 
>> range\"\n");
>> +return -1;
>> +}
>> +} else {
>> +ret = flower_parse_port(*argv, ip_proto,
>> +FLOWER_ENDPOINT_DST, n);
>> +if (ret < 0) {
>> +fprintf(stderr, "Illegal 
>> \"dst_port\"\n");
>> +return -1;
>> +}
> 
> Take a look at ip/iprule.c, line 921:
>   } else if (strcmp(*argv, "sport") == 0) {
>   ...
>   }
> 
> Using sscanf and handling the ret to be 1 or 2 should simplify the above.
> 

Okay, will simplify using sscanf.

>>  }
>>  } else if (matches(*argv, "src_port") == 0) {
>> +__be16 min, max;
>> +
>>  NEXT_ARG();
>> -ret = flower_parse_port(*argv, ip_proto,
>> -FLOWER_ENDPOINT_SRC, n);
>> -

[PATCH bpf] bpf, doc: add entries of who looks over which jits

2018-11-26 Thread Daniel Borkmann
Make the high-level BPF JIT entry a general 'catch-all' and add
architecture specific entries to make it more clear who actively
maintains which BPF JIT compiler. The list (L) address implies
that this eventually lands in the bpf patchwork bucket. Goal is
that this set of responsible developers listed here is always up
to date and a point of contact for helping out in e.g. feature
development, fixes, review or testing patches in order to help
long-term in ensuring quality of the BPF JITs and therefore BPF
core under a given architecture. Every new JIT in future /must/
have an entry here as well.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
Acked-by: Naveen N. Rao 
Acked-by: Sandipan Das 
Acked-by: Martin Schwidefsky 
Acked-by: Heiko Carstens 
Acked-by: David S. Miller 
Acked-by: Zi Shen Lim 
Acked-by: Paul Burton 
Acked-by: Jakub Kicinski 
Acked-by: Wang YanQing 
---
 MAINTAINERS | 63 -
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 03c46f4..bfaa411 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2801,7 +2801,7 @@ T:git 
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
 Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
 S: Supported
-F: arch/x86/net/bpf_jit*
+F: arch/*/net/*
 F: Documentation/networking/filter.txt
 F: Documentation/bpf/
 F: include/linux/bpf*
@@ -2821,6 +2821,67 @@ F:   tools/bpf/
 F: tools/lib/bpf/
 F: tools/testing/selftests/bpf/
 
+BPF JIT for ARM
+M: Shubham Bansal 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/arm/net/
+
+BPF JIT for ARM64
+M: Daniel Borkmann 
+M: Alexei Starovoitov 
+M: Zi Shen Lim 
+L: netdev@vger.kernel.org
+S: Supported
+F: arch/arm64/net/
+
+BPF JIT for MIPS (32-BIT AND 64-BIT)
+M: Paul Burton 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/mips/net/
+
+BPF JIT for NFP NICs
+M: Jakub Kicinski 
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/ethernet/netronome/nfp/bpf/
+
+BPF JIT for POWERPC (32-BIT AND 64-BIT)
+M: Naveen N. Rao 
+M: Sandipan Das 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/powerpc/net/
+
+BPF JIT for S390
+M: Martin Schwidefsky 
+M: Heiko Carstens 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/s390/net/
+X: arch/s390/net/pnet.c
+
+BPF JIT for SPARC (32-BIT AND 64-BIT)
+M: David S. Miller 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/sparc/net/
+
+BPF JIT for X86 32-BIT
+M: Wang YanQing 
+L: netdev@vger.kernel.org
+S: Maintained
+F: arch/x86/net/bpf_jit_comp32.c
+
+BPF JIT for X86 64-BIT
+M: Alexei Starovoitov 
+M: Daniel Borkmann 
+L: netdev@vger.kernel.org
+S: Supported
+F: arch/x86/net/
+X: arch/x86/net/bpf_jit_comp32.c
+
 BROADCOM B44 10/100 ETHERNET DRIVER
 M: Michael Chan 
 L: netdev@vger.kernel.org
-- 
2.9.5



[no subject]

2018-11-26 Thread Offer
-- 
-- 
Guten Tag, Wir sind eine registrierte private Geldverleiher. Wir geben
Kredite an Firmen, Einzelpersonen, die ihre finanzielle Status auf der
ganzen Welt aktualisieren müssen, mit minimalen jährlichen Zinsen von
2% .reply, wenn nötig.

Good Day, We are a registered private money lender. We give out loans
to firms, Individual who need to update their financial status all
over the world, with Minimal annual Interest Rates of 2%reply if
needed.


Re: iproute2 compile and linking errors on Fedora 19

2018-11-26 Thread Stephen Hemminger
On Tue, 31 Oct 2017 16:28:20 -0700
Cong Wang  wrote:

> On Tue, Oct 31, 2017 at 2:10 PM, Stephen Hemminger
>  wrote:
> >
> > IPPROTO_MH comes from include/uapi/linux/in6.h
> > Maybe it is trying to use old kernel headers from libc.  
> 
> So newer iproute2 is not supposed to work with older
> kernel header??

iproute2 is supposed to be completely self contained with its own sanitized 
kernel
headers. It should not be using kernel headers directly (or through /usr).


[PATCH net] bonding: fix 802.3ad state sent to partner when unbinding slave

2018-11-26 Thread Toni Peltonen
Previously when unbinding a slave the 802.3ad implementation only told
partner that the port is not suitable for aggregation by setting the port
aggregation state from aggregatable to individual. This is not enough. If the
physical layer still stays up and we only unbound this port from the bond there
is nothing in the aggregation status alone to prevent the partner from sending
traffic towards us. To ensure that the partner doesn't consider this
port at all anymore we should also disable collecting and distributing to
signal that this actor is going away.

I have tested this behaviour against Arista EOS switches with mlx5 cards
(physical link stays up even when interface is down) and simulated
the same situation virtually Linux <-> Linux with two network namespaces
running two veth device pairs. In both cases setting aggregation to
individual doesn't alone prevent traffic from being sent towards this
port given that the link stays up in partners end. Partner still keeps
its end in collecting + distributing state and continues until timeout is
reached. In most cases this means we are losing the traffic partner sends
towards our port while we wait for timeout. This is most visible with slow
periodic time (LACP rate slow).

Other open source implementations like Open VSwitch and libreswitch, and
vendor implementations like Arista EOS, seem to disable collecting +
distributing when doing similar port disabling/detaching/removing change.
With this patch kernel implementation would behave the same way and ensure
partner doesn't consider our actor viable anymore.

Signed-off-by: Toni Peltonen 
---
 drivers/net/bonding/bond_3ad.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index f43fb2f958a5..6776c33753dc 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2086,6 +2086,8 @@ void bond_3ad_unbind_slave(struct slave *slave)
   aggregator->aggregator_identifier);
 
/* Tell the partner that this port is not suitable for aggregation */
+   port->actor_oper_port_state &= ~AD_STATE_COLLECTING;
+   port->actor_oper_port_state &= ~AD_STATE_DISTRIBUTING;
port->actor_oper_port_state &= ~AD_STATE_AGGREGATION;
__update_lacpdu_from_port(port);
ad_lacpdu_send(port);
-- 
2.19.0



Re: [RFC PATCH 0/3] sk_buff: add skb extension infrastructure

2018-11-26 Thread Eric Dumazet



On 11/26/2018 03:38 AM, Florian Westphal wrote:
> The (out-of-tree) Multipath-TCP implementation needs a significant amount
> of extra space in the skb control buffer.
> 
> Increasing skb->cb[] size in mainline is a non-starter for memory and
> and performance reasons (f.e. increase in cb size also moves several
> frequently-accessed fields to other cache lines).
>

One thing that could be done without too much impact is to provide a cbext[]
only for TCP packets in write/rtx queue, that is not in sk_buff but
on the struct sk_buff_fclones

This extra space would not be 0-initialized at alloc_skb()
and would not be copied at skb_clone()

I mentioned this idea a while back already.


diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 
73902acf2b71c8800d81b744a936a7420f33b459..c4bfc2fd98eb9723c0219d5cd8bf5b28afaf5398
 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1018,6 +1018,8 @@ struct sk_buff_fclones {
struct sk_buff  skb2;
 
refcount_t  fclone_ref;
+
+   charcbext[128] __aligned(8);
 };
 
 /**




[PATCH bpf] sparc: Correct ctx->saw_frame_pointer logic.

2018-11-26 Thread David Miller


We need to initialize the frame pointer register not just if it is
seen as a source operand, but also if it is seen as the destination
operand of a store or an atomic instruction (which effectively is a
source operand).

This is exercised by test_verifier's "non-invalid fp arithmetic"

Signed-off-by: David S. Miller 

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 222785a..ec4da4d 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1270,6 +1270,9 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
const u8 tmp2 = bpf2sparc[TMP_REG_2];
u32 opcode = 0, rs2;
 
+   if (insn->dst_reg == BPF_REG_FP)
+   ctx->saw_frame_pointer = true;
+
ctx->tmp_2_used = true;
emit_loadimm(imm, tmp2, ctx);
 
@@ -1308,6 +1311,9 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
const u8 tmp = bpf2sparc[TMP_REG_1];
u32 opcode = 0, rs2;
 
+   if (insn->dst_reg == BPF_REG_FP)
+   ctx->saw_frame_pointer = true;
+
switch (BPF_SIZE(code)) {
case BPF_W:
opcode = ST32;
@@ -1340,6 +1346,9 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
const u8 tmp2 = bpf2sparc[TMP_REG_2];
const u8 tmp3 = bpf2sparc[TMP_REG_3];
 
+   if (insn->dst_reg == BPF_REG_FP)
+   ctx->saw_frame_pointer = true;
+
ctx->tmp_1_used = true;
ctx->tmp_2_used = true;
ctx->tmp_3_used = true;
@@ -1360,6 +1369,9 @@ static int build_insn(const struct bpf_insn *insn, struct 
jit_ctx *ctx)
const u8 tmp2 = bpf2sparc[TMP_REG_2];
const u8 tmp3 = bpf2sparc[TMP_REG_3];
 
+   if (insn->dst_reg == BPF_REG_FP)
+   ctx->saw_frame_pointer = true;
+
ctx->tmp_1_used = true;
ctx->tmp_2_used = true;
ctx->tmp_3_used = true;


[PATCH v2 net-next] tcp: remove hdrlen argument from tcp_queue_rcv()

2018-11-26 Thread Eric Dumazet
Only one caller needs to pull TCP headers, so lets
move __skb_pull() to the caller side.

Signed-off-by: Eric Dumazet 
Acked-by: Yuchung Cheng 
---
v2: sent as a standalone patch.

 net/ipv4/tcp_input.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 
568dbf3b711af75e5f4f0a309f8943579e913494..f32397890b6dcbc34976954c4be142108efa04d8
 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4603,13 +4603,12 @@ static void tcp_data_queue_ofo(struct sock *sk, struct 
sk_buff *skb)
}
 }
 
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, 
int hdrlen,
- bool *fragstolen)
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
 {
int eaten;
struct sk_buff *tail = skb_peek_tail(>sk_receive_queue);
 
-   __skb_pull(skb, hdrlen);
eaten = (tail &&
 tcp_try_coalesce(sk, tail,
  skb, fragstolen)) ? 1 : 0;
@@ -4660,7 +4659,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, 
size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
 
-   if (tcp_queue_rcv(sk, skb, 0, )) {
+   if (tcp_queue_rcv(sk, skb, )) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4720,7 +4719,7 @@ static void tcp_data_queue(struct sock *sk, struct 
sk_buff *skb)
goto drop;
}
 
-   eaten = tcp_queue_rcv(sk, skb, 0, );
+   eaten = tcp_queue_rcv(sk, skb, );
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5596,8 +5595,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff 
*skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
 
/* Bulk data transfer: receiver */
-   eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- );
+   __skb_pull(skb, tcp_header_len);
+   eaten = tcp_queue_rcv(sk, skb, );
 
tcp_event_data_recv(sk, skb);
 
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog



[PATCH mlx5-next 06/13] net/mlx5: Remove unused events callback and logic

2018-11-26 Thread Saeed Mahameed
The mlx5_interface->event callback is not used by mlx5e/mlx5_ib anymore.

We totally remove the delayed events logic work around, since with
the dynamic notifier registration API it is not needed anymore, mlx5_ib
can register its notifier and start receiving events exactly at the moment
it is ready to handle them.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c | 109 +-
 .../net/ethernet/mellanox/mlx5/core/events.c  |   8 +-
 .../net/ethernet/mellanox/mlx5/core/health.c  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/main.c|  10 --
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |   3 -
 include/linux/mlx5/driver.h   |  10 +-
 6 files changed, 11 insertions(+), 132 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c 
b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index d63ba8813829..d2ed14bc37c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -32,7 +32,6 @@
 
 #include 
 #include "mlx5_core.h"
-#include "lib/mlx5.h"
 
 static LIST_HEAD(intf_list);
 static LIST_HEAD(mlx5_dev_list);
@@ -46,75 +45,11 @@ struct mlx5_device_context {
unsigned long   state;
 };
 
-struct mlx5_delayed_event {
-   struct list_headlist;
-   struct mlx5_core_dev*dev;
-   enum mlx5_dev_event event;
-   unsigned long   param;
-};
-
 enum {
MLX5_INTERFACE_ADDED,
MLX5_INTERFACE_ATTACHED,
 };
 
-static void add_delayed_event(struct mlx5_priv *priv,
- struct mlx5_core_dev *dev,
- enum mlx5_dev_event event,
- unsigned long param)
-{
-   struct mlx5_delayed_event *delayed_event;
-
-   delayed_event = kzalloc(sizeof(*delayed_event), GFP_ATOMIC);
-   if (!delayed_event) {
-   mlx5_core_err(dev, "event %d is missed\n", event);
-   return;
-   }
-
-   mlx5_core_dbg(dev, "Accumulating event %d\n", event);
-   delayed_event->dev = dev;
-   delayed_event->event = event;
-   delayed_event->param = param;
-   list_add_tail(_event->list, >waiting_events_list);
-}
-
-static void delayed_event_release(struct mlx5_device_context *dev_ctx,
- struct mlx5_priv *priv)
-{
-   struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, 
priv);
-   struct mlx5_delayed_event *de;
-   struct mlx5_delayed_event *n;
-   struct list_head temp;
-
-   INIT_LIST_HEAD();
-
-   spin_lock_irq(>ctx_lock);
-
-   priv->is_accum_events = false;
-   list_splice_init(>waiting_events_list, );
-   if (!dev_ctx->context)
-   goto out;
-   list_for_each_entry_safe(de, n, , list)
-   dev_ctx->intf->event(dev, dev_ctx->context, de->event, 
de->param);
-
-out:
-   spin_unlock_irq(>ctx_lock);
-
-   list_for_each_entry_safe(de, n, , list) {
-   list_del(>list);
-   kfree(de);
-   }
-}
-
-/* accumulating events that can come after mlx5_ib calls to
- * ib_register_device, till adding that interface to the events list.
- */
-static void delayed_event_start(struct mlx5_priv *priv)
-{
-   spin_lock_irq(>ctx_lock);
-   priv->is_accum_events = true;
-   spin_unlock_irq(>ctx_lock);
-}
 
 void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 {
@@ -130,8 +65,6 @@ void mlx5_add_device(struct mlx5_interface *intf, struct 
mlx5_priv *priv)
 
dev_ctx->intf = intf;
 
-   delayed_event_start(priv);
-
dev_ctx->context = intf->add(dev);
if (dev_ctx->context) {
set_bit(MLX5_INTERFACE_ADDED, _ctx->state);
@@ -143,8 +76,6 @@ void mlx5_add_device(struct mlx5_interface *intf, struct 
mlx5_priv *priv)
spin_unlock_irq(>ctx_lock);
}
 
-   delayed_event_release(dev_ctx, priv);
-
if (!dev_ctx->context)
kfree(dev_ctx);
 }
@@ -188,26 +119,20 @@ static void mlx5_attach_interface(struct mlx5_interface 
*intf, struct mlx5_priv
if (!dev_ctx)
return;
 
-   delayed_event_start(priv);
if (intf->attach) {
if (test_bit(MLX5_INTERFACE_ATTACHED, _ctx->state))
-   goto out;
+   return;
if (intf->attach(dev, dev_ctx->context))
-   goto out;
-
+   return;
set_bit(MLX5_INTERFACE_ATTACHED, _ctx->state);
} else {
if (test_bit(MLX5_INTERFACE_ADDED, _ctx->state))
-   goto out;
+   return;
dev_ctx->context = intf->add(dev);
if (!dev_ctx->context)
-   goto out;
-
+   return;
set_bit(MLX5_INTERFACE_ADDED, _ctx->state);
}
-
-out:
-   delayed_event_release(dev_ctx, priv);
 }
 
 void 

[PATCH mlx5-next 13/13] net/mlx5: Debug print for forwarded async events

2018-11-26 Thread Saeed Mahameed
Print a debug message for every async FW event forwarded to mlx5
interfaces (mlx5e netdev and mlx5_ib rdma module).

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 9e6e216faac3..e92df7020a26 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -229,7 +229,10 @@ static int forward_event(struct notifier_block *nb, 
unsigned long event, void *d
 {
struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, 
nb);
struct mlx5_events   *events   = event_nb->ctx;
+   struct mlx5_eqe  *eqe  = data;
 
+   mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d) forward to 
interfaces\n",
+ eqe_type_str(eqe->type), eqe->sub_type);
atomic_notifier_call_chain(>nh, event, data);
return NOTIFY_OK;
 }
-- 
2.19.1



[PATCH mlx5-next 09/13] IB/mlx5: Handle raw delay drop general event

2018-11-26 Thread Saeed Mahameed
Handle FW general event rq delay drop as it was received from FW via mlx5
notifiers API, instead of handling the processed software version of that
event. After this patch we can safely remove all software processed FW
events types and definitions.

Signed-off-by: Saeed Mahameed 
---
 drivers/infiniband/hw/mlx5/main.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index a0668b923f78..7e6af18e7d82 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4236,6 +4236,18 @@ static void delay_drop_handler(struct work_struct *work)
mutex_unlock(_drop->lock);
 }
 
+static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe 
*eqe,
+struct ib_event *ibev)
+{
+   switch (eqe->sub_type) {
+   case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
+   schedule_work(>delay_drop.delay_drop_work);
+   break;
+   default: /* do nothing */
+   return;
+   }
+}
+
 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
  struct ib_event *ibev)
 {
@@ -4308,9 +4320,9 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
if (handle_port_change(ibdev, work->param, ))
goto out;
break;
-   case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
-   schedule_work(>delay_drop.delay_drop_work);
-   goto out;
+   case MLX5_EVENT_TYPE_GENERAL_EVENT:
+   handle_general_event(ibdev, work->param, );
+   /* fall through */
default:
goto out;
}
-- 
2.19.1



[PATCH mlx5-next 08/13] net/mlx5: Allow forwarding event type general event as is

2018-11-26 Thread Saeed Mahameed
FW general event is used by mlx5_ib for RQ delay drop timeout event
handling, in this patch we allow to forward FW general event type to mlx5
notifiers chain so mlx5_ib can handle it and to deprecate the software
version of it.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index ab66f5d65a04..735a9b038a73 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -36,6 +36,7 @@ static struct mlx5_nb events_nbs_ref[] = {
 
/* Events to be forwarded (as is) to mlx5 core interfaces 
(mlx5e/mlx5_ib) */
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PORT_CHANGE },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_GENERAL_EVENT },
 };
 
 struct mlx5_events {
-- 
2.19.1



[PATCH mlx5-next 04/13] net/mlx5: Forward all mlx5 events to mlx5 notifiers chain

2018-11-26 Thread Saeed Mahameed
This is to allow seamless migration to the new notifier chain API, and to
eventually deprecate the interfaces dev->event callback.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c | 3 +++
 include/linux/mlx5/driver.h   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c 
b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 7eedbea38a78..d63ba8813829 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -32,6 +32,7 @@
 
 #include 
 #include "mlx5_core.h"
+#include "lib/mlx5.h"
 
 static LIST_HEAD(intf_list);
 static LIST_HEAD(mlx5_dev_list);
@@ -425,6 +426,8 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum 
mlx5_dev_event event,
dev_ctx->intf->event(dev, dev_ctx->context, event, 
param);
 
spin_unlock_irqrestore(>ctx_lock, flags);
+
+   mlx5_notifier_call_chain(dev->priv.events, event, (void *)param);
 }
 
 void mlx5_dev_list_lock(void)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b96929d0cc9c..14ca74707275 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -195,7 +195,7 @@ struct mlx5_rsc_debug {
 };
 
 enum mlx5_dev_event {
-   MLX5_DEV_EVENT_SYS_ERROR,
+   MLX5_DEV_EVENT_SYS_ERROR = 128, /* 0 - 127 are FW events */
MLX5_DEV_EVENT_PORT_UP,
MLX5_DEV_EVENT_PORT_DOWN,
MLX5_DEV_EVENT_PORT_INITIALIZED,
-- 
2.19.1



[PATCH mlx5-next 12/13] net/mlx5: Forward SRQ resource events

2018-11-26 Thread Saeed Mahameed
Allow forwarding of SRQ events to mlx5_core interfaces, e.g. mlx5_ib.
Use mlx5_notifier_register/unregister in srq.c in order to allow seamless
transition of srq.c to infiniband subsystem.

Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/events.c  |  3 ++
 drivers/net/ethernet/mellanox/mlx5/core/srq.c | 38 +--
 include/linux/mlx5/driver.h   |  3 +-
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 201c5f6091ea..9e6e216faac3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -44,6 +44,9 @@ static struct mlx5_nb events_nbs_ref[] = {
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PATH_MIG_FAILED },
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR },
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_WQ_ACCESS_ERROR },
+   /* SRQ events */
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_SRQ_CATAS_ERROR },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_SRQ_RQ_LIMIT },
 };
 
 struct mlx5_events {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
index 0563866c13f2..79c5f0d57956 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -40,15 +40,21 @@
 #include "mlx5_core.h"
 #include "lib/eq.h"
 
-static int srq_event_notifier(struct mlx5_srq_table *table,
+static int srq_event_notifier(struct notifier_block *nb,
  unsigned long type, void *data)
 {
+   struct mlx5_srq_table *table;
struct mlx5_core_dev *dev;
struct mlx5_core_srq *srq;
struct mlx5_priv *priv;
struct mlx5_eqe *eqe;
u32 srqn;
 
+   if (type != MLX5_EVENT_TYPE_SRQ_CATAS_ERROR &&
+   type != MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)
+   return NOTIFY_DONE;
+
+   table = container_of(nb, struct mlx5_srq_table, nb);
priv  = container_of(table, struct mlx5_priv, srq_table);
dev   = container_of(priv, struct mlx5_core_dev, priv);
 
@@ -77,26 +83,6 @@ static int srq_event_notifier(struct mlx5_srq_table *table,
return NOTIFY_OK;
 }
 
-static int catas_err_notifier(struct notifier_block *nb,
- unsigned long type, void *data)
-{
-   struct mlx5_srq_table *table;
-
-   table = mlx5_nb_cof(nb, struct mlx5_srq_table, catas_err_nb);
-   /* type == MLX5_EVENT_TYPE_SRQ_CATAS_ERROR */
-   return srq_event_notifier(table, type, data);
-}
-
-static int rq_limit_notifier(struct notifier_block *nb,
-unsigned long type, void *data)
-{
-   struct mlx5_srq_table *table;
-
-   table = mlx5_nb_cof(nb, struct mlx5_srq_table, rq_limit_nb);
-   /* type == MLX5_EVENT_TYPE_SRQ_RQ_LIMIT */
-   return srq_event_notifier(table, type, data);
-}
-
 static int get_pas_size(struct mlx5_srq_attr *in)
 {
u32 log_page_size = in->log_page_size + 12;
@@ -743,17 +729,13 @@ void mlx5_init_srq_table(struct mlx5_core_dev *dev)
spin_lock_init(>lock);
INIT_RADIX_TREE(>tree, GFP_ATOMIC);
 
-   MLX5_NB_INIT(>catas_err_nb, catas_err_notifier, SRQ_CATAS_ERROR);
-   mlx5_eq_notifier_register(dev, >catas_err_nb);
-
-   MLX5_NB_INIT(>rq_limit_nb, rq_limit_notifier, SRQ_RQ_LIMIT);
-   mlx5_eq_notifier_register(dev, >rq_limit_nb);
+   table->nb.notifier_call = srq_event_notifier;
+   mlx5_notifier_register(dev, >nb);
 }
 
 void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev)
 {
struct mlx5_srq_table *table = >priv.srq_table;
 
-   mlx5_eq_notifier_unregister(dev, >rq_limit_nb);
-   mlx5_eq_notifier_unregister(dev, >catas_err_nb);
+   mlx5_notifier_unregister(dev, >nb);
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4f078b7f6620..27a481b159ed 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -465,8 +465,7 @@ struct mlx5_qp_table {
 };
 
 struct mlx5_srq_table {
-   struct mlx5_nb  catas_err_nb;
-   struct mlx5_nb  rq_limit_nb;
+   struct notifier_block   nb;
/* protect radix tree
 */
spinlock_t  lock;
-- 
2.19.1



[PATCH mlx5-next 05/13] IB/mlx5: Use the new mlx5 core notifier API

2018-11-26 Thread Saeed Mahameed
Remove the deprecated mlx5_interface->event mlx5_ib callback and use new
mlx5 notifier API to subscribe for mlx5 events.

For native mlx5_ib devices profiles pf_profile/nic_rep_profile register
the notifier callback mlx5_ib_handle_event which treats the notifier
context as mlx5_ib_dev.

For vport representors, don't register any notifier, same as before, they
didn't receive any mlx5 events.

For slave port (mlx5_ib_multiport_info) register a different notifier
callback mlx5_ib_event_slave_port, which knows that the event is coming
for mlx5_ib_multiport_info and prepares the event job accordingly.
Before this on the event handler work we had to ask mlx5_core if this is
a slave port mlx5_core_is_mp_slave(work->dev), now it is not needed
anymore.
mlx5_ib_multiport_info notifier registration is done on
mlx5_ib_bind_slave_port and de-registration is done on
mlx5_ib_unbind_slave_port.

Signed-off-by: Saeed Mahameed 
---
 drivers/infiniband/hw/mlx5/main.c| 77 +++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  3 ++
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index fcf4a0328a90..549c766cf309 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -82,10 +82,13 @@ static char mlx5_version[] =
 
 struct mlx5_ib_event_work {
struct work_struct  work;
-   struct mlx5_core_dev*dev;
-   void*context;
+   union {
+   struct mlx5_ib_dev*dev;
+   struct mlx5_ib_multiport_info *mpi;
+   };
+   boolis_slave;
enum mlx5_dev_event event;
-   unsigned long   param;
+   void*param;
 };
 
 enum {
@@ -4240,14 +4243,14 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
struct mlx5_ib_dev *ibdev;
struct ib_event ibev;
bool fatal = false;
-   u8 port = (u8)work->param;
+   u8 port = (u8)(unsigned long)work->param;
 
-   if (mlx5_core_is_mp_slave(work->dev)) {
-   ibdev = mlx5_ib_get_ibdev_from_mpi(work->context);
+   if (work->is_slave) {
+   ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
if (!ibdev)
goto out;
} else {
-   ibdev = work->context;
+   ibdev = work->dev;
}
 
switch (work->event) {
@@ -4256,7 +4259,6 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
mlx5_ib_handle_internal_error(ibdev);
fatal = true;
break;
-
case MLX5_DEV_EVENT_PORT_UP:
case MLX5_DEV_EVENT_PORT_DOWN:
case MLX5_DEV_EVENT_PORT_INITIALIZED:
@@ -4311,22 +4313,43 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
kfree(work);
 }
 
-static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
- enum mlx5_dev_event event, unsigned long param)
+static int mlx5_ib_event(struct notifier_block *nb,
+unsigned long event, void *param)
 {
struct mlx5_ib_event_work *work;
 
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
-   return;
+   return NOTIFY_DONE;
 
INIT_WORK(>work, mlx5_ib_handle_event);
-   work->dev = dev;
+   work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
+   work->is_slave = false;
work->param = param;
-   work->context = context;
work->event = event;
 
queue_work(mlx5_ib_event_wq, >work);
+
+   return NOTIFY_OK;
+}
+
+static int mlx5_ib_event_slave_port(struct notifier_block *nb,
+   unsigned long event, void *param)
+{
+   struct mlx5_ib_event_work *work;
+
+   work = kmalloc(sizeof(*work), GFP_ATOMIC);
+   if (!work)
+   return NOTIFY_DONE;
+
+   INIT_WORK(>work, mlx5_ib_handle_event);
+   work->mpi = container_of(nb, struct mlx5_ib_multiport_info, 
mdev_events);
+   work->is_slave = true;
+   work->param = param;
+   work->event = event;
+   queue_work(mlx5_ib_event_wq, >work);
+
+   return NOTIFY_OK;
 }
 
 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
@@ -5357,6 +5380,11 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev 
*ibdev,
spin_unlock(>mp.mpi_lock);
return;
}
+
+   if (mpi->mdev_events.notifier_call)
+   mlx5_notifier_unregister(mpi->mdev, >mdev_events);
+   mpi->mdev_events.notifier_call = NULL;
+
mpi->ibdev = NULL;
 
spin_unlock(>mp.mpi_lock);
@@ -5412,6 +5440,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev 
*ibdev,
 
ibdev->port[port_num].mp.mpi = mpi;
mpi->ibdev = ibdev;
+   mpi->mdev_events.notifier_call = NULL;
spin_unlock(>port[port_num].mp.mpi_lock);
 
err = 

[PATCH mlx5-next 11/13] net/mlx5: Forward QP/WorkQueues resource events

2018-11-26 Thread Saeed Mahameed
Allow forwarding QP and WQ events to mlx5_core interfaces, e.g. mlx5_ib

Use mlx5_notifier_register/unregister in qp.c in order to allow seamless
transition of qp.c to infiniband subsystem.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 10 ++
 drivers/net/ethernet/mellanox/mlx5/core/qp.c |  8 
 include/linux/mlx5/driver.h  |  2 +-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 3708b42c1d6b..201c5f6091ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -34,6 +34,16 @@ static struct mlx5_nb events_nbs_ref[] = {
/* Events to be forwarded (as is) to mlx5 core interfaces 
(mlx5e/mlx5_ib) */
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PORT_CHANGE },
{.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_GENERAL_EVENT },
+   /* QP/WQ resource events to forward */
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_DCT_DRAINED },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PATH_MIG },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_COMM_EST },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_SQ_DRAINED },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_SRQ_LAST_WQE },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_WQ_CATAS_ERROR },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PATH_MIG_FAILED },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR },
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_WQ_ACCESS_ERROR },
 };
 
 struct mlx5_events {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c 
b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 28726c63101f..388f205a497f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -150,7 +150,7 @@ static int rsc_event_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
 
-   table = mlx5_nb_cof(nb, struct mlx5_qp_table, nb);
+   table = container_of(nb, struct mlx5_qp_table, nb);
priv  = container_of(table, struct mlx5_priv, qp_table);
dev   = container_of(priv, struct mlx5_core_dev, priv);
 
@@ -523,15 +523,15 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev)
INIT_RADIX_TREE(>tree, GFP_ATOMIC);
mlx5_qp_debugfs_init(dev);
 
-   MLX5_NB_INIT(>nb, rsc_event_notifier, NOTIFY_ANY);
-   mlx5_eq_notifier_register(dev, >nb);
+   table->nb.notifier_call = rsc_event_notifier;
+   mlx5_notifier_register(dev, >nb);
 }
 
 void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev)
 {
struct mlx5_qp_table *table = >priv.qp_table;
 
-   mlx5_eq_notifier_unregister(dev, >nb);
+   mlx5_notifier_unregister(dev, >nb);
mlx5_qp_debugfs_cleanup(dev);
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a77bedb8a556..4f078b7f6620 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -456,7 +456,7 @@ struct mlx5_core_health {
 };
 
 struct mlx5_qp_table {
-   struct mlx5_nb  nb;
+   struct notifier_block   nb;
 
/* protect radix tree
 */
-- 
2.19.1



[PATCH mlx5-next 10/13] net/mlx5: Remove all deprecated software versions of FW events

2018-11-26 Thread Saeed Mahameed
Before the new mlx5 event notification infrastructure and API,
mlx5_core used to process all events before forwarding them to mlx5
interfaces (mlx5e/mlx5_ib) and used to translate the event type enum
to a software defined enum, this is not needed anymore since it is ok
for mlx5e and mlx5_ib to receive FW events as is, at least the few ones
mlx5 core allows.

mlx5e and mlx5_ib already moved to use the new API and they only handle FW
events types, it is now safe to remove all equivalent software defined
events and the logic around them.

Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/events.c  | 92 +--
 include/linux/mlx5/driver.h   |  9 --
 2 files changed, 1 insertion(+), 100 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 735a9b038a73..3708b42c1d6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -19,8 +19,6 @@ struct mlx5_event_nb {
  * separate notifiers callbacks, specifically by those mlx5 components.
  */
 static int any_notifier(struct notifier_block *, unsigned long, void *);
-static int port_change(struct notifier_block *, unsigned long, void *);
-static int general_event(struct notifier_block *, unsigned long, void *);
 static int temp_warn(struct notifier_block *, unsigned long, void *);
 static int port_module(struct notifier_block *, unsigned long, void *);
 
@@ -28,9 +26,8 @@ static int port_module(struct notifier_block *, unsigned 
long, void *);
 static int forward_event(struct notifier_block *, unsigned long, void *);
 
 static struct mlx5_nb events_nbs_ref[] = {
+   /* Events to be proccessed by mlx5_core */
{.nb.notifier_call = any_notifier,  .event_type = 
MLX5_EVENT_TYPE_NOTIFY_ANY },
-   {.nb.notifier_call = port_change,   .event_type = 
MLX5_EVENT_TYPE_PORT_CHANGE },
-   {.nb.notifier_call = general_event, .event_type = 
MLX5_EVENT_TYPE_GENERAL_EVENT },
{.nb.notifier_call = temp_warn, .event_type = 
MLX5_EVENT_TYPE_TEMP_WARN_EVENT },
{.nb.notifier_call = port_module,   .event_type = 
MLX5_EVENT_TYPE_PORT_MODULE_EVENT },
 
@@ -127,93 +124,6 @@ static int any_notifier(struct notifier_block *nb,
return NOTIFY_OK;
 }
 
-static enum mlx5_dev_event port_subtype2dev(u8 subtype)
-{
-   switch (subtype) {
-   case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
-   return MLX5_DEV_EVENT_PORT_DOWN;
-   case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
-   return MLX5_DEV_EVENT_PORT_UP;
-   case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
-   return MLX5_DEV_EVENT_PORT_INITIALIZED;
-   case MLX5_PORT_CHANGE_SUBTYPE_LID:
-   return MLX5_DEV_EVENT_LID_CHANGE;
-   case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
-   return MLX5_DEV_EVENT_PKEY_CHANGE;
-   case MLX5_PORT_CHANGE_SUBTYPE_GUID:
-   return MLX5_DEV_EVENT_GUID_CHANGE;
-   case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
-   return MLX5_DEV_EVENT_CLIENT_REREG;
-   }
-   return -1;
-}
-
-/* type == MLX5_EVENT_TYPE_PORT_CHANGE */
-static int port_change(struct notifier_block *nb,
-  unsigned long type, void *data)
-{
-   struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, 
nb);
-   struct mlx5_events   *events   = event_nb->ctx;
-   struct mlx5_core_dev *dev  = events->dev;
-
-   bool dev_event_dispatch = false;
-   enum mlx5_dev_event dev_event;
-   unsigned long dev_event_data;
-   struct mlx5_eqe *eqe = data;
-   u8 port = (eqe->data.port.port >> 4) & 0xf;
-
-   switch (eqe->sub_type) {
-   case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
-   case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
-   case MLX5_PORT_CHANGE_SUBTYPE_LID:
-   case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
-   case MLX5_PORT_CHANGE_SUBTYPE_GUID:
-   case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
-   case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
-   dev_event = port_subtype2dev(eqe->sub_type);
-   dev_event_data = (unsigned long)port;
-   dev_event_dispatch = true;
-   break;
-   default:
-   mlx5_core_warn(dev, "Port event with unrecognized subtype: port 
%d, sub_type %d\n",
-  port, eqe->sub_type);
-   }
-
-   if (dev_event_dispatch)
-   mlx5_notifier_call_chain(events, dev_event, (void 
*)dev_event_data);
-
-   return NOTIFY_OK;
-}
-
-/* type == MLX5_EVENT_TYPE_GENERAL_EVENT */
-static int general_event(struct notifier_block *nb, unsigned long type, void 
*data)
-{
-   struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, 
nb);
-   struct mlx5_events   *events   = event_nb->ctx;
-   struct mlx5_core_dev *dev  = events->dev;
-
-   bool dev_event_dispatch = false;
-   enum mlx5_dev_event dev_event;
-   unsigned 

[PATCH mlx5-next 07/13] IB/mlx5: Handle raw port change event rather than the software version

2018-11-26 Thread Saeed Mahameed
Use the FW version of the port change event as forwarded via new mlx5
notifiers API.

After this patch, processed software version of the port change event
will become deprecated and will be totally removed in downstream
patches.

Signed-off-by: Saeed Mahameed 
---
 drivers/infiniband/hw/mlx5/main.c | 86 +++
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c 
b/drivers/infiniband/hw/mlx5/main.c
index 549c766cf309..a0668b923f78 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -87,7 +87,7 @@ struct mlx5_ib_event_work {
struct mlx5_ib_multiport_info *mpi;
};
boolis_slave;
-   enum mlx5_dev_event event;
+   unsigned intevent;
void*param;
 };
 
@@ -4236,6 +4236,51 @@ static void delay_drop_handler(struct work_struct *work)
mutex_unlock(_drop->lock);
 }
 
+static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
+ struct ib_event *ibev)
+{
+   u8 port = (eqe->data.port.port >> 4) & 0xf;
+
+   ibev->element.port_num = port;
+
+   switch (eqe->sub_type) {
+   case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+   case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+   case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+   /* In RoCE, port up/down events are handled in
+* mlx5_netdev_event().
+*/
+   if (mlx5_ib_port_link_layer(>ib_dev, port) ==
+   IB_LINK_LAYER_ETHERNET)
+   return -EINVAL;
+
+   ibev->event = (eqe->sub_type == 
MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
+   IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+   break;
+
+   case MLX5_PORT_CHANGE_SUBTYPE_LID:
+   ibev->event = IB_EVENT_LID_CHANGE;
+   break;
+
+   case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+   ibev->event = IB_EVENT_PKEY_CHANGE;
+   schedule_work(>devr.ports[port - 1].pkey_change_work);
+   break;
+
+   case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+   ibev->event = IB_EVENT_GID_CHANGE;
+   break;
+
+   case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+   ibev->event = IB_EVENT_CLIENT_REREGISTER;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static void mlx5_ib_handle_event(struct work_struct *_work)
 {
struct mlx5_ib_event_work *work =
@@ -4243,7 +4288,6 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
struct mlx5_ib_dev *ibdev;
struct ib_event ibev;
bool fatal = false;
-   u8 port = (u8)(unsigned long)work->param;
 
if (work->is_slave) {
ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
@@ -4257,37 +4301,12 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
case MLX5_DEV_EVENT_SYS_ERROR:
ibev.event = IB_EVENT_DEVICE_FATAL;
mlx5_ib_handle_internal_error(ibdev);
+   ibev.element.port_num  = (u8)(unsigned long)work->param;
fatal = true;
break;
-   case MLX5_DEV_EVENT_PORT_UP:
-   case MLX5_DEV_EVENT_PORT_DOWN:
-   case MLX5_DEV_EVENT_PORT_INITIALIZED:
-   /* In RoCE, port up/down events are handled in
-* mlx5_netdev_event().
-*/
-   if (mlx5_ib_port_link_layer(>ib_dev, port) ==
-   IB_LINK_LAYER_ETHERNET)
+   case MLX5_EVENT_TYPE_PORT_CHANGE:
+   if (handle_port_change(ibdev, work->param, ))
goto out;
-
-   ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ?
-IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
-   break;
-
-   case MLX5_DEV_EVENT_LID_CHANGE:
-   ibev.event = IB_EVENT_LID_CHANGE;
-   break;
-
-   case MLX5_DEV_EVENT_PKEY_CHANGE:
-   ibev.event = IB_EVENT_PKEY_CHANGE;
-   schedule_work(>devr.ports[port - 1].pkey_change_work);
-   break;
-
-   case MLX5_DEV_EVENT_GUID_CHANGE:
-   ibev.event = IB_EVENT_GID_CHANGE;
-   break;
-
-   case MLX5_DEV_EVENT_CLIENT_REREG:
-   ibev.event = IB_EVENT_CLIENT_REREGISTER;
break;
case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
schedule_work(>delay_drop.delay_drop_work);
@@ -4296,11 +4315,10 @@ static void mlx5_ib_handle_event(struct work_struct 
*_work)
goto out;
}
 
-   ibev.device   = >ib_dev;
-   ibev.element.port_num = port;
+   ibev.device = >ib_dev;
 
-   if (!rdma_is_port_valid(>ib_dev, port)) {
-   mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
+   if 

[PATCH mlx5-next 02/13] net/mlx5: Allow port change event to be forwarded to driver notifiers chain

2018-11-26 Thread Saeed Mahameed
The idea is to allow mlx5 core interfaces (mlx5e/mlx5_ib) to be able to
receive some allowed FW events as is via the new notifier API.

In this patch we allow forwarding port change event to mlx5 core interfaces
(mlx5e/mlx5_ib) as it was received from FW.
Once mlx5e and mlx5_ib start using this event we can safely remove the
redundant software version of it and its translation logic.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 560cc14c55f7..adab66eb726c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -24,12 +24,18 @@ static int general_event(struct notifier_block *, unsigned 
long, void *);
 static int temp_warn(struct notifier_block *, unsigned long, void *);
 static int port_module(struct notifier_block *, unsigned long, void *);
 
+/* handler which forwards the event to events->nh, driver notifiers */
+static int forward_event(struct notifier_block *, unsigned long, void *);
+
 static struct mlx5_nb events_nbs_ref[] = {
{.nb.notifier_call = any_notifier,  .event_type = 
MLX5_EVENT_TYPE_NOTIFY_ANY },
{.nb.notifier_call = port_change,   .event_type = 
MLX5_EVENT_TYPE_PORT_CHANGE },
{.nb.notifier_call = general_event, .event_type = 
MLX5_EVENT_TYPE_GENERAL_EVENT },
{.nb.notifier_call = temp_warn, .event_type = 
MLX5_EVENT_TYPE_TEMP_WARN_EVENT },
{.nb.notifier_call = port_module,   .event_type = 
MLX5_EVENT_TYPE_PORT_MODULE_EVENT },
+
+   /* Events to be forwarded (as is) to mlx5 core interfaces 
(mlx5e/mlx5_ib) */
+   {.nb.notifier_call = forward_event,   .event_type = 
MLX5_EVENT_TYPE_PORT_CHANGE },
 };
 
 struct mlx5_events {
@@ -294,6 +300,16 @@ void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct 
mlx5_pme_stats *stats)
*stats = dev->priv.events->pme_stats;
 }
 
+/* forward event as is to registered interfaces (mlx5e/mlx5_ib) */
+static int forward_event(struct notifier_block *nb, unsigned long event, void 
*data)
+{
+   struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, 
nb);
+   struct mlx5_events   *events   = event_nb->ctx;
+
+   atomic_notifier_call_chain(>nh, event, data);
+   return NOTIFY_OK;
+}
+
 int mlx5_events_init(struct mlx5_core_dev *dev)
 {
struct mlx5_events *events = kzalloc(sizeof(*events), GFP_KERNEL);
-- 
2.19.1



[PATCH mlx5-next 03/13] net/mlx5e: Use the new mlx5 core notifier API

2018-11-26 Thread Saeed Mahameed
Remove the deprecated mlx5_interface->event mlx5e callback and use new
mlx5 notifier API to subscribe for mlx5 events, handle port change event
as received from FW rather than handling the mlx5 core processed port
change software version event.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 ++-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 27 ++-
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index aea74856c702..13d8a74d3db5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -631,7 +631,6 @@ struct mlx5e_channel_stats {
 } cacheline_aligned_in_smp;
 
 enum {
-   MLX5E_STATE_ASYNC_EVENTS_ENABLED,
MLX5E_STATE_OPENED,
MLX5E_STATE_DESTROYING,
 };
@@ -690,6 +689,8 @@ struct mlx5e_priv {
struct hwtstamp_config tstamp;
u16q_counter;
u16drop_rq_q_counter;
+   struct notifier_block  events_nb;
+
 #ifdef CONFIG_MLX5_CORE_EN_DCB
struct mlx5e_dcbx  dcbx;
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0d495a6b3949..56bc41b1c31f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -294,33 +294,35 @@ void mlx5e_queue_update_stats(struct mlx5e_priv *priv)
queue_work(priv->wq, >update_stats_work);
 }
 
-static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
- enum mlx5_dev_event event, unsigned long param)
+static int async_event(struct notifier_block *nb, unsigned long event, void 
*data)
 {
-   struct mlx5e_priv *priv = vpriv;
+   struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, 
events_nb);
+   struct mlx5_eqe   *eqe = data;
 
-   if (!test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, >state))
-   return;
+   if (event != MLX5_EVENT_TYPE_PORT_CHANGE)
+   return NOTIFY_DONE;
 
-   switch (event) {
-   case MLX5_DEV_EVENT_PORT_UP:
-   case MLX5_DEV_EVENT_PORT_DOWN:
+   switch (eqe->sub_type) {
+   case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+   case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
queue_work(priv->wq, >update_carrier_work);
break;
default:
-   break;
+   return NOTIFY_DONE;
}
+
+   return NOTIFY_OK;
 }
 
 static void mlx5e_enable_async_events(struct mlx5e_priv *priv)
 {
-   set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, >state);
+   priv->events_nb.notifier_call = async_event;
+   mlx5_notifier_register(priv->mdev, >events_nb);
 }
 
 static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
 {
-   clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, >state);
-   mlx5_eq_synchronize_async_irq(priv->mdev);
+   mlx5_notifier_unregister(priv->mdev, >events_nb);
 }
 
 static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
@@ -5170,7 +5172,6 @@ static struct mlx5_interface mlx5e_interface = {
.remove= mlx5e_remove,
.attach= mlx5e_attach,
.detach= mlx5e_detach,
-   .event = mlx5e_async_event,
.protocol  = MLX5_INTERFACE_PROTOCOL_ETH,
.get_dev   = mlx5e_get_netdev,
 };
-- 
2.19.1



[PATCH mlx5-next 00/13] Mellanox, mlx5 core driver events API

2018-11-26 Thread Saeed Mahameed
Hi,

This patchset is for mlx5-next shared branch, and will be applied there
once the review is done.

The main idea of this change is to define a flexible scalable and
simpler high level mlx5 core APIs to forward device and driver events 
to upper level interface drivers e.g mlx5_ib and mlx5e netdevice driver.

Patch #1, Driver events notifier API:

Use atomic notifier chain to fire events to mlx5 core driver
consumers (mlx5e/mlx5_ib) and provide dynamic mlx5 register/unregister
notifier API to replace the internal mlx5_interface->event and totally
remove it.

Patch #2, Forward port events via the new notifier chain.
Patch #3, Use the mlx5 events API in mlx5e netdevice.
Patch #4, Forward all device events sent via mlx5_core_event (old API)
to the new notifier chain, this will allow seamless transition to the
new API.
Patch #5, Use the new events API in mlx5_IB
Patch #6, remove old interface callback, mlx5_interface->event
Patch #7,8,9, mlx5_ib to handle raw FW events as is rather than the
software version of them, this will remove any need for unnecessary
processing of FW events in the low level mlx5_core driver.
Patch #10, Remove unnecessary processing of FW events in the low level
mlx5_core driver, all events are handled on demand by mlx5_ib and/or
mlx5e netdevice.

patch #11,12, forward QP and SRQ events via the new notifier chain,
will be needed by mlx5_ib driver.

Patch #13, Debug patch for mlx5 events.

Thanks,
Saeed.

---

Saeed Mahameed (13):
  net/mlx5: Driver events notifier API
  net/mlx5: Allow port change event to be forwarded to driver notifiers
chain
  net/mlx5e: Use the new mlx5 core notifier API
  net/mlx5: Forward all mlx5 events to mlx5 notifiers chain
  IB/mlx5: Use the new mlx5 core notifier API
  net/mlx5: Remove unused events callback and logic
  IB/mlx5: Handle raw port change event rather than the software version
  net/mlx5: Allow forwarding event type general event as is
  IB/mlx5: Handle raw delay drop general event
  net/mlx5: Remove all deprecated software versions of FW events
  net/mlx5: Forward QP/WorkQueues resource events
  net/mlx5: Forward SRQ resource events
  net/mlx5: Debug print for forwarded async events

 drivers/infiniband/hw/mlx5/main.c | 179 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/dev.c | 106 +--
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  27 +--
 .../net/ethernet/mellanox/mlx5/core/events.c  | 150 ++-
 .../net/ethernet/mellanox/mlx5/core/health.c  |   3 +-
 .../ethernet/mellanox/mlx5/core/lib/mlx5.h|   1 +
 .../net/ethernet/mellanox/mlx5/core/main.c|  10 -
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |   3 -
 drivers/net/ethernet/mellanox/mlx5/core/qp.c  |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/srq.c |  38 +---
 include/linux/mlx5/driver.h   |  30 +--
 13 files changed, 235 insertions(+), 326 deletions(-)

-- 
2.19.1



[PATCH mlx5-next 01/13] net/mlx5: Driver events notifier API

2018-11-26 Thread Saeed Mahameed
Use atomic notifier chain to fire events to mlx5 core driver
consumers (mlx5e/mlx5_ib) and provide mlx5 register/unregister notifier
API.

This API will replace the current mlx5_interface->event callback and all
the logic around it, especially the delayed events logic introduced by
commit 97834eba7c19 ("net/mlx5: Delay events till ib registration ends")

Which is not needed anymore with this new API where the mlx5 interface
can dynamically register/unregister its notifier.

Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/events.c  | 25 ++-
 .../ethernet/mellanox/mlx5/core/lib/mlx5.h|  1 +
 include/linux/mlx5/driver.h   |  4 +++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c 
b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 3ad004af37d7..560cc14c55f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -35,7 +35,8 @@ static struct mlx5_nb events_nbs_ref[] = {
 struct mlx5_events {
struct mlx5_core_dev *dev;
struct mlx5_event_nb  notifiers[ARRAY_SIZE(events_nbs_ref)];
-
+   /* driver notifier chain */
+   struct atomic_notifier_head nh;
/* port module events stats */
struct mlx5_pme_stats pme_stats;
 };
@@ -300,6 +301,7 @@ int mlx5_events_init(struct mlx5_core_dev *dev)
if (!events)
return -ENOMEM;
 
+   ATOMIC_INIT_NOTIFIER_HEAD(>nh);
events->dev = dev;
dev->priv.events = events;
return 0;
@@ -330,3 +332,24 @@ void mlx5_events_stop(struct mlx5_core_dev *dev)
for (i = ARRAY_SIZE(events_nbs_ref) - 1; i >= 0 ; i--)
mlx5_eq_notifier_unregister(dev, >notifiers[i].nb);
 }
+
+int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block 
*nb)
+{
+   struct mlx5_events *events = dev->priv.events;
+
+   return atomic_notifier_chain_register(>nh, nb);
+}
+EXPORT_SYMBOL(mlx5_notifier_register);
+
+int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block 
*nb)
+{
+   struct mlx5_events *events = dev->priv.events;
+
+   return atomic_notifier_chain_unregister(>nh, nb);
+}
+EXPORT_SYMBOL(mlx5_notifier_unregister);
+
+int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, 
void *data)
+{
+   return atomic_notifier_call_chain(>nh, event, data);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 23317e328b0b..4d78a459676e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -73,5 +73,6 @@ struct mlx5_pme_stats {
 };
 
 void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct mlx5_pme_stats 
*stats);
+int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, 
void *data);
 
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ba64ecf72478..b96929d0cc9c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -46,6 +46,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1062,6 +1063,9 @@ struct mlx5_interface {
 void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
 int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
+int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block 
*nb);
+int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block 
*nb);
+
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
 
 int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
-- 
2.19.1



Re: [PATCH iproute2] ss: add support for delivered and delivered_ce fields

2018-11-26 Thread Yuchung Cheng
On Mon, Nov 26, 2018 at 2:29 PM, Eric Dumazet  wrote:
> Kernel support was added in linux-4.18 in commit feb5f2ec6464
> ("tcp: export packets delivery info")
>
> Tested:
>
> ss -ti
> ...
> ESTAB   0 2270520  [2607:f8b0:8099:e16::]:47646   
> [2607:f8b0:8099:e18::]:38953
>  ts sack cubic wscale:8,8 rto:7 rtt:2.824/0.278 mss:1428
>  pmtu:1500 rcvmss:536 advmss:1428 cwnd:89 ssthresh:62 
> bytes_acked:2097871945
> segs_out:1469144 segs_in:65221 data_segs_out:1469142 send 360.0Mbps 
> lastsnd:2
> lastrcv:99231 lastack:2 pacing_rate 431.9Mbps delivery_rate 246.4Mbps
> (*) delivered:1469099 delivered_ce:424799
> busy:99231ms unacked:44 rcv_space:14280 rcv_ssthresh:65535
> notsent:2207688 minrtt:0.228
>
> Signed-off-by: Eric Dumazet 
Acked-by: Yuchung Cheng 

Thank you Eric!
> ---
>  misc/ss.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/misc/ss.c b/misc/ss.c
> index 
> e4d6ae489e798419fa6ce6fb0f4b8b0b3232adf6..3aa94f235085512510dca9fd597e8e37aaaf0fd3
>  100644
> --- a/misc/ss.c
> +++ b/misc/ss.c
> @@ -817,6 +817,8 @@ struct tcpstat {
> unsigned intfackets;
> unsigned intreordering;
> unsigned intnot_sent;
> +   unsigned intdelivered;
> +   unsigned intdelivered_ce;
> double  rcv_rtt;
> double  min_rtt;
> int rcv_space;
> @@ -2483,6 +2485,10 @@ static void tcp_stats_print(struct tcpstat *s)
>
> if (s->delivery_rate)
> out(" delivery_rate %sbps", sprint_bw(b1, s->delivery_rate));
> +   if (s->delivered)
> +   out(" delivered:%u", s->delivered);
> +   if (s->delivered_ce)
> +   out(" delivered_ce:%u", s->delivered_ce);
> if (s->app_limited)
> out(" app_limited");
>
> @@ -2829,6 +2835,8 @@ static void tcp_show_info(const struct nlmsghdr *nlh, 
> struct inet_diag_msg *r,
> s.busy_time = info->tcpi_busy_time;
> s.rwnd_limited = info->tcpi_rwnd_limited;
> s.sndbuf_limited = info->tcpi_sndbuf_limited;
> +   s.delivered = info->tcpi_delivered;
> +   s.delivered_ce = info->tcpi_delivered_ce;
> tcp_stats_print();
> free(s.dctcp);
> free(s.bbr_info);
> --
> 2.20.0.rc0.387.gc7a69e6b6c-goog
>


[PATCH iproute2] ss: add support for delivered and delivered_ce fields

2018-11-26 Thread Eric Dumazet
Kernel support was added in linux-4.18 in commit feb5f2ec6464
("tcp: export packets delivery info")

Tested:

ss -ti
...
ESTAB   0 2270520  [2607:f8b0:8099:e16::]:47646   
[2607:f8b0:8099:e18::]:38953
 ts sack cubic wscale:8,8 rto:7 rtt:2.824/0.278 mss:1428
 pmtu:1500 rcvmss:536 advmss:1428 cwnd:89 ssthresh:62 bytes_acked:2097871945
segs_out:1469144 segs_in:65221 data_segs_out:1469142 send 360.0Mbps 
lastsnd:2
lastrcv:99231 lastack:2 pacing_rate 431.9Mbps delivery_rate 246.4Mbps
(*) delivered:1469099 delivered_ce:424799
busy:99231ms unacked:44 rcv_space:14280 rcv_ssthresh:65535
notsent:2207688 minrtt:0.228

Signed-off-by: Eric Dumazet 
---
 misc/ss.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/misc/ss.c b/misc/ss.c
index 
e4d6ae489e798419fa6ce6fb0f4b8b0b3232adf6..3aa94f235085512510dca9fd597e8e37aaaf0fd3
 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -817,6 +817,8 @@ struct tcpstat {
unsigned intfackets;
unsigned intreordering;
unsigned intnot_sent;
+   unsigned intdelivered;
+   unsigned intdelivered_ce;
double  rcv_rtt;
double  min_rtt;
int rcv_space;
@@ -2483,6 +2485,10 @@ static void tcp_stats_print(struct tcpstat *s)
 
if (s->delivery_rate)
out(" delivery_rate %sbps", sprint_bw(b1, s->delivery_rate));
+   if (s->delivered)
+   out(" delivered:%u", s->delivered);
+   if (s->delivered_ce)
+   out(" delivered_ce:%u", s->delivered_ce);
if (s->app_limited)
out(" app_limited");
 
@@ -2829,6 +2835,8 @@ static void tcp_show_info(const struct nlmsghdr *nlh, 
struct inet_diag_msg *r,
s.busy_time = info->tcpi_busy_time;
s.rwnd_limited = info->tcpi_rwnd_limited;
s.sndbuf_limited = info->tcpi_sndbuf_limited;
+   s.delivered = info->tcpi_delivered;
+   s.delivered_ce = info->tcpi_delivered_ce;
tcp_stats_print();
free(s.dctcp);
free(s.bbr_info);
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog



[PATCH bpf-next v2 2/3] bpf: add msg_pop_data helper to tools

2018-11-26 Thread John Fastabend
Add the necessary header definitions to tools for new
msg_pop_data_helper.

Signed-off-by: John Fastabend 
---
 tools/include/uapi/linux/bpf.h| 16 +++-
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 23e2031..597afdb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2268,6 +2268,19 @@ union bpf_attr {
  *
  * Return
  * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *  Description
+ * Will remove *pop* bytes from a *msg* starting at byte *start*.
+ * This may result in **ENOMEM** errors under certain situations if
+ * an allocation and copy are required due to a full ring buffer.
+ * However, the helper will try to avoid doing the allocation
+ * if possible. Other errors can occur if input parameters are
+ * invalid either due to *start* byte not being a valid part of msg
+ * payload and/or *pop* value being too large.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -2360,7 +2373,8 @@ union bpf_attr {
FN(map_push_elem),  \
FN(map_pop_elem),   \
FN(map_peek_elem),  \
-   FN(msg_push_data),
+   FN(msg_push_data),  \
+   FN(msg_pop_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
b/tools/testing/selftests/bpf/bpf_helpers.h
index 686e57c..7b69519 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -113,6 +113,8 @@ static int (*bpf_msg_pull_data)(void *ctx, int start, int 
end, int flags) =
(void *) BPF_FUNC_msg_pull_data;
 static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) =
(void *) BPF_FUNC_msg_push_data;
+static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) =
+   (void *) BPF_FUNC_msg_pop_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
-- 
2.7.4



[PATCH bpf-next v2 3/3] bpf: test_sockmap, add options for msg_pop_data() helper

2018-11-26 Thread John Fastabend
Similar to msg_pull_data and msg_push_data add a set of options to
have msg_pop_data() exercised.

Signed-off-by: John Fastabend 
---
 tools/testing/selftests/bpf/test_sockmap.c  | 127 +++-
 tools/testing/selftests/bpf/test_sockmap_kern.h |  70 ++---
 2 files changed, 180 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c
index 622ade0..e85a771 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -79,6 +79,8 @@ int txmsg_start;
 int txmsg_end;
 int txmsg_start_push;
 int txmsg_end_push;
+int txmsg_start_pop;
+int txmsg_pop;
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
@@ -104,6 +106,8 @@ static const struct option long_options[] = {
{"txmsg_end",   required_argument,  NULL, 'e'},
{"txmsg_start_push", required_argument, NULL, 'p'},
{"txmsg_end_push",   required_argument, NULL, 'q'},
+   {"txmsg_start_pop",  required_argument, NULL, 'w'},
+   {"txmsg_pop",required_argument, NULL, 'x'},
{"txmsg_ingress", no_argument,  _ingress, 1 },
{"txmsg_skb", no_argument,  _skb, 1 },
{"ktls", no_argument,   , 1 },
@@ -473,13 +477,27 @@ static int msg_loop(int fd, int iov_count, int 
iov_length, int cnt,
clock_gettime(CLOCK_MONOTONIC, >end);
} else {
int slct, recvp = 0, recv, max_fd = fd;
+   float total_bytes, txmsg_pop_total;
int fd_flags = O_NONBLOCK;
struct timeval timeout;
-   float total_bytes;
fd_set w;
 
fcntl(fd, fd_flags);
+   /* Account for pop bytes noting each iteration of apply will
+* call msg_pop_data helper so we need to account for this
+* by calculating the number of apply iterations. Note user
+* of the tool can create cases where no data is sent by
+* manipulating pop/push/pull/etc. For example txmsg_apply 1
+* with txmsg_pop 1 will try to apply 1B at a time but each
+* iteration will then pop 1B so no data will ever be sent.
+* This is really only useful for testing edge cases in code
+* paths.
+*/
total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+   txmsg_pop_total = txmsg_pop;
+   if (txmsg_apply)
+   txmsg_pop_total *= (total_bytes / txmsg_apply);
+   total_bytes -= txmsg_pop_total;
err = clock_gettime(CLOCK_MONOTONIC, >start);
if (err < 0)
perror("recv start time: ");
@@ -488,7 +506,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, 
int cnt,
timeout.tv_sec = 0;
timeout.tv_usec = 30;
} else {
-   timeout.tv_sec = 1;
+   timeout.tv_sec = 3;
timeout.tv_usec = 0;
}
 
@@ -503,7 +521,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, 
int cnt,
goto out_errno;
} else if (!slct) {
if (opt->verbose)
-   fprintf(stderr, "unexpected timeout\n");
+   fprintf(stderr, "unexpected timeout: 
recved %zu/%f pop_total %f\n", s->bytes_recvd, total_bytes, txmsg_pop_total);
errno = -EIO;
clock_gettime(CLOCK_MONOTONIC, >end);
goto out_errno;
@@ -619,7 +637,7 @@ static int sendmsg_test(struct sockmap_options *opt)
iov_count = 1;
err = msg_loop(rx_fd, iov_count, iov_buf,
   cnt, , false, opt);
-   if (err && opt->verbose)
+   if (opt->verbose)
fprintf(stderr,
"msg_loop_rx: iov_count %i iov_buf %i cnt %i 
err %i\n",
iov_count, iov_buf, cnt, err);
@@ -931,6 +949,39 @@ static int run_options(struct sockmap_options *options, 
int cg_fd,  int test)
}
}
 
+   if (txmsg_start_pop) {
+   i = 4;
+   err = bpf_map_update_elem(map_fd[5],
+ , _start_pop, 
BPF_ANY);
+   if (err) {
+   fprintf(stderr,
+   "ERROR: bpf_map_update_elem %i@%i 
(txmsg_start_pop):  %d (%s)\n",
+   txmsg_start_pop, i, err, 
strerror(errno));
+

Re: [PATCH mlx5-next 00/11] mlx5 core internal firmware events handling improvements

2018-11-26 Thread Saeed Mahameed
On Tue, 2018-11-20 at 14:12 -0800, Saeed Mahameed wrote:
> Hi
> 
> This patchset is for mlx5-next shared branch, and will be applied
> there
> once the review is done.
> 
> The main idea of this change is to define a flexible scalable and
> simpler low level mlx5 core APIs to upper level components for better
> features decoupling and maximum code locality and modularity.
> 
> Improve and simplify mlx5 core internal firmware and device async
> events
> handling and subscription, currently all async firmware events are
> handled in one place (switch case in eq.c) and every time we need to
> update one of the mlx5_core handlers or add new events handling to
> the
> system, the driver needs to be changed in many places in order to
> deliver
> the new event to its consumer.
> 
> To improve this we will use atomic_notifier_chain to fire firmware
> events
> at internal mlx5 core components such as eswitch/fpga/clock/FW
> tracer/etc..,
> this is to avoid explicit calls from low level mlx5_core to upper
> components
> and to simplify the mlx5_core API for future developments.
> 
> Provide register/unregister notifiers API and call the notifier chain
> on
> firmware async events.
> 
> Example to subscribe to a FW event:
> 
> struct mlx5_nb port_event;
> 
> MLX5_NB_INIT(_event, port_event_handler, PORT_CHANGE);
> mlx5_eq_notifier_register(mdev, _event);
> 
> Where:
>   - port_event_handler is the notifier block callback.
>   - PORT_EVENT is the suffix of MLX5_EVENT_TYPE_PORT_CHANGE (The
> event
> type to subscribe to)
> 
> The above will guarantee that port_event_handler will receive all FW
> events of the type MLX5_EVENT_TYPE_PORT_CHANGE.
> 
> To receive all FW/HW events one can subscribe to
> MLX5_EVENT_TYPE_NOTIFY_ANY.
>  
> There can be only 128 types of firmware events each has its own
> 64Byte 
> EQE (Event Queue Element) data, we will have one
> atomic_notifier_chain
> per event type for maximum performance and verbosity.
> Each handler is going to receive the event_type as unsigned long and
> the event data as void pointer, exactly as defined in the notifier
> block
> handlers prototype.
>
> This API is implemented in the first patch of this series all
> following
> patches are modifying the existing mlx5 components to use the new API
> to
> subscribe to FW events.
> 
> Thanks,
> Saeed.
> 
> ---
> 
> Saeed Mahameed (11):
>   net/mlx5: EQ, Introduce atomic notifier chain subscription API
>   net/mlx5: FWTrace, Use async events chain
>   net/mlx5: FPGA, Use async events chain
>   net/mlx5: Clock, Use async events chain
>   net/mlx5: E-Switch, Use async events chain
>   net/mlx5: FWPage, Use async events chain
>   net/mlx5: CmdIF, Use async events chain
>   net/mlx5: Resource tables, Use async events chain
>   net/mlx5: CQ ERR, Use async events chain
>   net/mlx5: Device events, Use async events chain
>   net/mlx5: Improve core device events handling
> 


Applied to mlx5-next branch.

Thanks!



[PATCH bpf-next v2 1/3] bpf: helper to pop data from messages

2018-11-26 Thread John Fastabend
This adds a BPF SK_MSG program helper so that we can pop data from a
msg. We use this to pop metadata from a previous push data call.

Signed-off-by: John Fastabend 
---
 include/uapi/linux/bpf.h |  16 -
 net/core/filter.c| 171 +++
 net/ipv4/tcp_bpf.c   |  17 -
 net/tls/tls_sw.c |  11 ++-
 4 files changed, 209 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23e2031..597afdb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,6 +2268,19 @@ union bpf_attr {
  *
  * Return
  * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *  Description
+ * Will remove *pop* bytes from a *msg* starting at byte *start*.
+ * This may result in **ENOMEM** errors under certain situations if
+ * an allocation and copy are required due to a full ring buffer.
+ * However, the helper will try to avoid doing the allocation
+ * if possible. Other errors can occur if input parameters are
+ * invalid either due to *start* byte not being a valid part of msg
+ * payload and/or *pop* value being too large.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -2360,7 +2373,8 @@ union bpf_attr {
FN(map_push_elem),  \
FN(map_pop_elem),   \
FN(map_peek_elem),  \
-   FN(msg_push_data),
+   FN(msg_push_data),  \
+   FN(msg_pop_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index f50ea97..bd0df75 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2425,6 +2425,174 @@ static const struct bpf_func_proto 
bpf_msg_push_data_proto = {
.arg4_type  = ARG_ANYTHING,
 };
 
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+   int prev;
+
+   do {
+   prev = i;
+   sk_msg_iter_var_next(i);
+   msg->sg.data[prev] = msg->sg.data[i];
+   } while (i != msg->sg.end);
+
+   sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+   struct scatterlist tmp, sge;
+
+   sk_msg_iter_next(msg, end);
+   sge = sk_msg_elem_cpy(msg, i);
+   sk_msg_iter_var_next(i);
+   tmp = sk_msg_elem_cpy(msg, i);
+
+   while (i != msg->sg.end) {
+   msg->sg.data[i] = sge;
+   sk_msg_iter_var_next(i);
+   sge = tmp;
+   tmp = sk_msg_elem_cpy(msg, i);
+   }
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+  u32, len, u64, flags)
+{
+   u32 i = 0, l, space, offset = 0;
+   u64 last = start + len;
+   int pop;
+
+   if (unlikely(flags))
+   return -EINVAL;
+
+   /* First find the starting scatterlist element */
+   i = msg->sg.start;
+   do {
+   l = sk_msg_elem(msg, i)->length;
+
+   if (start < offset + l)
+   break;
+   offset += l;
+   sk_msg_iter_var_next(i);
+   } while (i != msg->sg.end);
+
+   /* Bounds checks: start and pop must be inside message */
+   if (start >= offset + l || last >= msg->sg.size)
+   return -EINVAL;
+
+   space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+   pop = len;
+   /* --| offset
+* -| start  | len ---|
+*
+*  |- a | pop ---|- b |
+*  |__| length
+*
+*
+* a:   region at front of scatter element to save
+* b:   region at back of scatter element to save when length > A + pop
+* pop: region to pop from element, same as input 'pop' here will be
+*  decremented below per iteration.
+*
+* Two top-level cases to handle when start != offset, first B is non
+* zero and second B is zero corresponding to when a pop includes more
+* than one element.
+*
+* Then if B is non-zero AND there is no space allocate space and
+* compact A, B regions into page. If there is space shift ring to
+*   the right, freeing the next element in ring to place B, leaving
+* A untouched except to reduce length.
+*/
+   if (start != offset) {
+   struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+   int a = start;
+   int b = sge->length - pop - a;
+
+   sk_msg_iter_var_next(i);
+
+   if (pop < sge->length - a) {
+   

[PATCH bpf-next v2 0/3] bpf: add sk_msg helper sk_msg_pop_data

2018-11-26 Thread John Fastabend
After being able to add metadata to messages with sk_msg_push_data we
have also found it useful to be able to "pop" this metadata off before
sending it to applications in some cases. This series adds a new helper
sk_msg_pop_data() and the associated patches to add tests and tools/lib
support.

Thanks!

v2: Daniel caught that we missed adding sk_msg_pop_data to the changes
data helper so that the verifier ensures BPF programs revalidate
data after using this helper. Also improve documentation adding a
return description and using RST syntax per Quentin's comment. And
delta calculations for DROP with pop'd data (albeit a strange set
of operations for a program to be doing) had potential to be
incorrect possibly confusing user space applications, so fix it.

John Fastabend (3):
  bpf: helper to pop data from messages
  bpf: add msg_pop_data helper to tools
  bpf: test_sockmap, add options for msg_pop_data() helper usage

 include/uapi/linux/bpf.h|  13 +-
 net/core/filter.c   | 169 
 net/ipv4/tcp_bpf.c  |  14 +-
 tools/include/uapi/linux/bpf.h  |  13 +-
 tools/testing/selftests/bpf/bpf_helpers.h   |   2 +
 tools/testing/selftests/bpf/test_sockmap.c  | 127 +-
 tools/testing/selftests/bpf/test_sockmap_kern.h |  70 --
 7 files changed, 386 insertions(+), 22 deletions(-)

-- 
2.7.4



Re: consistency for statistics with XDP mode

2018-11-26 Thread Jakub Kicinski
On Wed, 21 Nov 2018 14:06:49 -0700, David Ahern wrote:
> Keeping the basic xdp packets in the standard counters allows Paweł, for
> example, to continue to monitor /proc/net/dev.
> 
> Can we get agreement on this? And from there, get updates to the mlx5
> and virtio drivers?

I have a long standing itch to add more detailed but standardized
netlink stats, including per queue statistics.  Per queue stats should
cover XDP naturally.  I will try to sketch it out and send an RFC later this
week FWIW.


Re: [PATCH bpf] bpf: Support sk lookup in netns with id 0

2018-11-26 Thread David Ahern
On 11/26/18 2:27 PM, Joe Stringer wrote:
> @@ -2405,6 +2407,9 @@ enum bpf_func_id {
>  /* BPF_FUNC_perf_event_output for sk_buff input context. */
>  #define BPF_F_CTXLEN_MASK(0xfULL << 32)
>  
> +/* BPF_FUNC_sk_lookup_tcp and BPF_FUNC_sk_lookup_udp flags. */
> +#define BPF_F_SK_CURRENT_NS  0x8000 /* For netns field */
> +

I went down the nsid road because it will be needed for other use cases
(e.g., device lookups), and we should have a general API for network
namespaces. Given that, I think the _SK should be dropped from the name.


Re: [PATCH bpf-next] bpf: Avoid unnecessary instruction in conver_bpf_ld_abs()

2018-11-26 Thread Daniel Borkmann
On 11/26/2018 10:42 PM, David Miller wrote:
> 
> 'offset' is constant and if it is zero, no need to subtract it
> from BPF_REG_TMP.
> 
> Signed-off-by: David S. Miller 

Applied to bpf-next, thanks!


[PATCH bpf] bpf: Support sk lookup in netns with id 0

2018-11-26 Thread Joe Stringer
David Ahern and Nicolas Dichtel report that the handling of the netns id
0 is incorrect for the BPF socket lookup helpers: rather than finding
the netns with id 0, it is resolving to the current netns. This renders
the netns_id 0 inaccessible.

To fix this, adjust the API for the netns to treat all u32 values with
the highest bit set (BPF_F_SK_CURRENT_NS) as a lookup in the current
netns, while any values with a lower value (including zero) would result
in a lookup for a socket in the netns corresponding to that id. As
before, if the netns with that ID does not exist, no socket will be
found.

Signed-off-by: Joe Stringer 
---
 include/uapi/linux/bpf.h  | 29 +---
 net/core/filter.c | 16 -
 tools/include/uapi/linux/bpf.h| 33 ---
 .../selftests/bpf/test_sk_lookup_kern.c   | 18 +-
 4 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 852dc17ab47a..543945d520b9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2187,12 +2187,13 @@ union bpf_attr {
  * **sizeof**\ (*tuple*\ **->ipv6**)
  * Look for an IPv6 socket.
  *
- * If the *netns* is zero, then the socket lookup table in the
- * netns associated with the *ctx* will be used. For the TC hooks,
- * this in the netns of the device in the skb. For socket hooks,
- * this in the netns of the socket. If *netns* is non-zero, then
- * it specifies the ID of the netns relative to the netns
- * associated with the *ctx*.
+ * If the *netns* is **BPF_F_SK_CURRENT_NS** or greater, then the
  * socket lookup table in the netns associated with the *ctx*
  * will be used. For the TC hooks, this is the netns of the device
+ * in the skb. For socket hooks, this is the netns of the socket.
+ * If *netns* is less than **BPF_F_SK_CURRENT_NS**, then it
+ * specifies the ID of the netns relative to the netns associated
+ * with the *ctx*.
  *
  * All values for *flags* are reserved for future usage, and must
  * be left at zero.
@@ -2219,12 +2220,13 @@ union bpf_attr {
  * **sizeof**\ (*tuple*\ **->ipv6**)
  * Look for an IPv6 socket.
  *
- * If the *netns* is zero, then the socket lookup table in the
- * netns associated with the *ctx* will be used. For the TC hooks,
- * this in the netns of the device in the skb. For socket hooks,
- * this in the netns of the socket. If *netns* is non-zero, then
- * it specifies the ID of the netns relative to the netns
- * associated with the *ctx*.
+ * If the *netns* is **BPF_F_SK_CURRENT_NS** or greater, then the
  * socket lookup table in the netns associated with the *ctx*
  * will be used. For the TC hooks, this is the netns of the device
+ * in the skb. For socket hooks, this is the netns of the socket.
+ * If *netns* is less than **BPF_F_SK_CURRENT_NS**, then it
+ * specifies the ID of the netns relative to the netns associated
+ * with the *ctx*.
  *
  * All values for *flags* are reserved for future usage, and must
  * be left at zero.
@@ -2405,6 +2407,9 @@ enum bpf_func_id {
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
 #define BPF_F_CTXLEN_MASK  (0xfULL << 32)
 
+/* BPF_FUNC_sk_lookup_tcp and BPF_FUNC_sk_lookup_udp flags. */
+#define BPF_F_SK_CURRENT_NS0x8000 /* For netns field */
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
diff --git a/net/core/filter.c b/net/core/filter.c
index 9a1327eb25fa..8c8a7ad3f5e6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4882,7 +4882,7 @@ static struct sock *sk_lookup(struct net *net, struct 
bpf_sock_tuple *tuple,
  */
 static unsigned long
 bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
- u8 proto, u64 netns_id, u64 flags)
+ u8 proto, u32 netns_id, u64 flags)
 {
struct net *caller_net;
struct sock *sk = NULL;
@@ -4890,22 +4890,22 @@ bpf_sk_lookup(struct sk_buff *skb, struct 
bpf_sock_tuple *tuple, u32 len,
struct net *net;
 
family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
-   if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+   if (unlikely(family == AF_UNSPEC || flags))
goto out;
 
if (skb->dev)
caller_net = dev_net(skb->dev);
else
caller_net = sock_net(skb->sk);
-   if (netns_id) {
+   if (netns_id & BPF_F_SK_CURRENT_NS) {
+   net = caller_net;
+

[PATCH bpf-next] bpf: Avoid unnecessary instruction in conver_bpf_ld_abs()

2018-11-26 Thread David Miller


'offset' is constant and if it is zero, no need to subtract it
from BPF_REG_TMP.

Signed-off-by: David S. Miller 

diff --git a/net/core/filter.c b/net/core/filter.c
index aa274679965d..f50ea971f7a9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -463,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, 
struct bpf_insn **insnp)
bool ldx_off_ok = offset <= S16_MAX;
 
*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
-   *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+   if (offset)
+   *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
  size, 2 + endian + (!ldx_off_ok * 2));
if (ldx_off_ok) {


Re: [RFC PATCH 0/3] sk_buff: add skb extension infrastructure

2018-11-26 Thread Florian Westphal
David Miller  wrote:
> From: Florian Westphal 
> Date: Mon, 26 Nov 2018 22:19:33 +0100
> 
> >> In the future please document what is so enormous and absolutely
> >> required that they must put it all into the SKB control block.
> > 
> > Ok, will do.
> 
> Why don't we establish the details about what MP-TCP needs in the CB
> before we discuss this further.
> 
> Could you do that for us?

Sure.

There is also an MPTCP conf call this week and I'll present this
patchset there as well.

I'll post the requirements and what possible alternatives have
been considered here as a follow-up.

Thanks,
Florian


Re: [RFC PATCH 0/3] sk_buff: add skb extension infrastructure

2018-11-26 Thread David Miller
From: Florian Westphal 
Date: Mon, 26 Nov 2018 22:19:33 +0100

>> In the future please document what is so enormous and absolutely
>> required that they must put it all into the SKB control block.
> 
> Ok, will do.

Why don't we establish the details about what MP-TCP needs in the CB
before we discuss this further.

Could you do that for us?

I'm open minded about your approach still, now that I've taken the
xfrm array and nf_bridge aspects into consideration.  So please keep
pursuing this.

Thanks Florian.


Re: [RFC PATCH 0/3] sk_buff: add skb extension infrastructure

2018-11-26 Thread Florian Westphal
David Miller  wrote:
> > This adds an extension infrastructure for sk_buff instead:
> > 1. extension memory is released when the sk_buff is free'd.
> > 2. data is shared after cloning an skb.
> > 3. adding extension to an skb will COW the extension
> >buffer if needed.
> 
> So MP-TCP, when enabled for a connection, will have a new atomic
> operation for every packet?

Yes, at least for every kfree_skb call.

> And new tests all in the fast paths of the networking to facilitate
> this feature, a cost paid by everyone.

No, right now everyone has two non-atomic tests (skb->sp + skb->nf_bridge),
with this proposal everyone has one (skb->active_extensions), assuming that
both br_nf and xfrm are converted to use the extension system.

Test(s) occur both on copy/clone and kfree_skb, just like in current
kernels.

atomic test(s) are done in case skb->{sp,nf_bridge} are set, with
this patch it's done if skb->active_extensions is != 0.

So from that angle current status is kept.

Main motivation was to find a solution that does not add more costs
for normal cases.

I did a quick hack to also convert skb->sp, it seems possible to do so.

In that case skbuff size is reduced by 8 bytes as sp/nf_bridge get
replaced by single 'extension pointer', and slightly less code
provided kernel is built with both XFRM and bridge netfilter support.

> Sorry, that doesn't seem like a good idea to me.
>
> Can't they just encode whatever huge amount of crap they want to
> put into the CB by deriving the information from skb->sk and some
> tiny value like an index or something to resolve the path?

Perhaps, if thats the only way I'm afraid thats what will need to be
used.  I did try such a scheme once in the past to get
rid of skb->nf_bridge and things became very very fugly due to
kfree_skb() not being aware of such 'external storage', i.e. no
way to easily clean the external storage when an skbuff gets tossed.

Might be possible to use destructor to take care of this in mptcp case.
I can have a look if this is the only possible way.

> In the future please document what is so enormous and absolutely
> required that they must put it all into the SKB control block.

Ok, will do.

> Like Eric, I am concerned about the slow creep of overhead.  Lots of
> small "not that bad" additions of extra cycles here and there over
> time adds up to impossible to fix performance regressions.

I have the same concern, which is why i am proposing the conversion
of xfrm and nf_bridge to use this instead of the current
nf_bridge/secpath maintanance.

Although MPTCP is the main motivation here, it was intended as a
standalone series, i.e., these 3 patches and a few followup changes
to convert xfrm.

> I'm sorry if this is a major disappointment for the MP-TCP folks but a
> better way needs to be found to integrate what they want to do with
> real zero cost for the rest of the world which won't be using MP-TCP
> and therefore should not be paying for it's added overhead at all.

Agreed.


Re: pull-request: bpf-next 2018-11-26

2018-11-26 Thread David Miller
From: Daniel Borkmann 
Date: Mon, 26 Nov 2018 14:25:28 +0100

> The following pull-request contains BPF updates for your *net-next* tree.
...
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
> 
> Note, there is a tiny merge conflict in BPF's kselftest Makefile.
> Resolution is to take both chunks, like:
> 
>   [...]
>   test_sk_lookup_kern.o test_xdp_vlan.o test_queue_map.o test_stack_map.o \
>   xdp_dummy.o test_map_in_map.o

Pulled, thanks!


[PATCH bpf] sparc: Fix JIT fused branch convergance.

2018-11-26 Thread David Miller


On T4 and later sparc64 cpus we can use the fused compare and branch
instruction.

However, it can only be used if the branch destination is in the range
of a signed 10-bit immediate offset.  This amounts to 1024
instructions forwards or backwards.

After the commit referenced in the Fixes: tag, the largest possible
size program seen by the JIT explodes by a significant factor.

As a result of this, convergence takes many more passes since the
expanded "BPF_LDX | BPF_MSH | BPF_B" code sequence, for example,
contains several embedded branch on condition instructions.

On each pass, as suddenly new fused compare and branch instances
become valid, this makes thousands more in range for the next pass.
And so on and so forth.

This is most greatly exemplified by "BPF_MAXINSNS: exec all MSH" which
takes 35 passes to converge, and shrinks the image by about 64K.

To decrease the cost of this number of convergence passes, do the
convergence pass before we have the program image allocated, just like
other JITs (such as x86) do.

Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
Signed-off-by: David S. Miller 

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 222785a..7217d63 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1425,12 +1425,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
struct bpf_prog *tmp, *orig_prog = prog;
struct sparc64_jit_data *jit_data;
struct bpf_binary_header *header;
+   u32 prev_image_size, image_size;
bool tmp_blinded = false;
bool extra_pass = false;
struct jit_ctx ctx;
-   u32 image_size;
u8 *image_ptr;
-   int pass;
+   int pass, i;
 
if (!prog->jit_requested)
return orig_prog;
@@ -1461,61 +1461,82 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
header = jit_data->header;
extra_pass = true;
image_size = sizeof(u32) * ctx.idx;
+   prev_image_size = image_size;
+   pass = 1;
goto skip_init_ctx;
}
 
memset(, 0, sizeof(ctx));
ctx.prog = prog;
 
-   ctx.offset = kcalloc(prog->len, sizeof(unsigned int), GFP_KERNEL);
+   ctx.offset = kmalloc_array(prog->len, sizeof(unsigned int), GFP_KERNEL);
if (ctx.offset == NULL) {
prog = orig_prog;
goto out_off;
}
 
-   /* Fake pass to detect features used, and get an accurate assessment
-* of what the final image size will be.
+   /* Longest sequence emitted is for bswap32, 12 instructions.  Pre-cook
+* the offset array so that we converge faster.
 */
-   if (build_body()) {
-   prog = orig_prog;
-   goto out_off;
-   }
-   build_prologue();
-   build_epilogue();
+   for (i = 0; i < prog->len; i++)
+   ctx.offset[i] = i * (12 * 4);
 
-   /* Now we know the actual image size. */
-   image_size = sizeof(u32) * ctx.idx;
-   header = bpf_jit_binary_alloc(image_size, _ptr,
- sizeof(u32), jit_fill_hole);
-   if (header == NULL) {
-   prog = orig_prog;
-   goto out_off;
-   }
-
-   ctx.image = (u32 *)image_ptr;
-skip_init_ctx:
-   for (pass = 1; pass < 3; pass++) {
+   prev_image_size = ~0U;
+   for (pass = 1; pass < 40; pass++) {
ctx.idx = 0;
 
build_prologue();
-
if (build_body()) {
-   bpf_jit_binary_free(header);
prog = orig_prog;
goto out_off;
}
-
build_epilogue();
 
if (bpf_jit_enable > 1)
-   pr_info("Pass %d: shrink = %d, seen = 
[%c%c%c%c%c%c]\n", pass,
-   image_size - (ctx.idx * 4),
+   pr_info("Pass %d: size = %u, seen = [%c%c%c%c%c%c]\n", 
pass,
+   ctx.idx * 4,
ctx.tmp_1_used ? '1' : ' ',
ctx.tmp_2_used ? '2' : ' ',
ctx.tmp_3_used ? '3' : ' ',
ctx.saw_frame_pointer ? 'F' : ' ',
ctx.saw_call ? 'C' : ' ',
ctx.saw_tail_call ? 'T' : ' ');
+
+   if (ctx.idx * 4 == prev_image_size)
+   break;
+   prev_image_size = ctx.idx * 4;
+   cond_resched();
+   }
+
+   /* Now we know the actual image size. */
+   image_size = sizeof(u32) * ctx.idx;
+   header = bpf_jit_binary_alloc(image_size, _ptr,
+ sizeof(u32), jit_fill_hole);
+   if (header == NULL) {
+   prog = orig_prog;
+   goto out_off;
+   }
+
+   

Re: [RFC PATCH 0/3] sk_buff: add skb extension infrastructure

2018-11-26 Thread David Miller
From: Florian Westphal 
Date: Mon, 26 Nov 2018 12:38:54 +0100

> This adds an extension infrastructure for sk_buff instead:
> 1. extension memory is released when the sk_buff is free'd.
> 2. data is shared after cloning an skb.
> 3. adding extension to an skb will COW the extension
>buffer if needed.

So MP-TCP, when enabled for a connection, will have a new atomic
operation for every packet?

And new tests all in the fast paths of the networking to facilitate
this feature, a cost paid by everyone.

Sorry, that doesn't seem like a good idea to me.

Can't they just encode whatever huge amount of crap they want to
put into the CB by deriving the information from skb->sk and some
tiny value like an index or something to resolve the path?

In the future please document what is so enormous and absolutely
required that they must put it all into the SKB control block.

Like Eric, I am concerned about the slow creep of overhead.  Lots of
small "not that bad" additions of extra cycles here and there over
time adds up to impossible to fix performance regressions.

I'm sorry if this is a major disappointment for the MP-TCP folks but a
better way needs to be found to integrate what they want to do with
real zero cost for the rest of the world which won't be using MP-TCP
and therefore should not be paying for it's added overhead at all.


Re: [PATCH net-next,v3 00/12] add flow_rule infrastructure

2018-11-26 Thread Marcelo Ricardo Leitner
On Mon, Nov 26, 2018 at 08:33:36PM +0100, Pablo Neira Ayuso wrote:
> Hi Marcelo,

Hello!

> 
> On Thu, Nov 22, 2018 at 07:08:32PM -0200, Marcelo Ricardo Leitner wrote:
> > On Thu, Nov 22, 2018 at 02:22:20PM -0200, Marcelo Ricardo Leitner wrote:
> > > On Wed, Nov 21, 2018 at 03:51:20AM +0100, Pablo Neira Ayuso wrote:
> > > > Hi,
> > > > 
> > > > This patchset is the third iteration [1] [2] [3] to introduce a kernel
> > > > intermediate (IR) to express ACL hardware offloads.
> > > 
> > > On v2 cover letter you had:
> > > 
> > > """
> > > However, cost of this layer is very small, adding 1 million rules via
> > > tc -batch, perf shows:
> > > 
> > >  0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action
> > > """
> > > 
> > > The above doesn't include time spent on children calls and I'm worried
> > > about the new allocation done by flow_rule_alloc(), as it can impact
> > > rule insertion rate. I'll run some tests here and report back.
> > 
> > I'm seeing +60ms on 1.75s (~3.4%) to add 40k flower rules on ingress
> > with skip_hw and tc in batch mode, with flows like:
> > 
> > filter add dev p6p2 parent : protocol ip prio 1 flower skip_hw
> > src_mac ec:13:db:00:00:00 dst_mac ec:14:c2:00:00:00 src_ip
> > 56.0.0.0 dst_ip 55.0.0.0 action drop
> > 
> > Only 20ms out of those 60ms were consumed within fl_change() calls
> > (considering children calls), though.
> > 
> > Do you see something similar?  I used current net-next (d59da3fbfe3f)
> > and with this patchset applied.
> 
> I see lots of send() and recv() in tc -batch via strace, using this
> example rule, repeating it N times:
> 
> filter add dev eth0 parent : protocol ip pref 1 flower dst_mac 
> f4:52:14:10:df:92 action mirred egress redirect dev eth1
> 
> This is taking ~8 seconds for 40k rules from my old laptop [*], this
> is already not too fast (without my patchset).

On a E5-2643 v3 @ 3.40GHz I see a total of 1.17s with an old iproute
(4.11) (more below).

> 
> I remember we discussed adding support for real batching for tc
> - we can probably do this transparently by assuming that if the
> skbuff length mismatches nlmsghdr->len field, then we enter the batch
> mode from the kernel. This would require to update iproute2 to use
> libmnl batching routines, or code that follows similar approach
> otherwise.

Yes, I believe you're referring to

commit 485d0c6001c4aa134b99c86913d6a7089b7b2ab0
Author: Chris Mi 
Date:   Fri Jan 12 14:13:16 2018 +0900

tc: Add batchsize feature for filter and actions

Which is present in 4.16. It does transparent batching on app side.

With tc from today's tip, I get 1.05s for 40k rules, both with this
patchset applied.

> 
> [*] 0.5 seconds in nft (similar ruleset), this is using netlink batching.

Nice.

Cheers,
  Marcelo


Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

2018-11-26 Thread Willem de Bruijn
On Mon, Nov 26, 2018 at 1:19 PM Willem de Bruijn
 wrote:
>
> On Mon, Nov 26, 2018 at 1:04 PM Paolo Abeni  wrote:
> >
> > On Mon, 2018-11-26 at 12:59 -0500, Willem de Bruijn wrote:
> > > The callers of this function do flush the queue of the other skbs on
> > > error, but only after the call to sock_zerocopy_put_abort.
> > >
> > > sock_zerocopy_put_abort depends on total rollback to revert the
> > > sk_zckey increment and suppress the completion notification (which
> > > must not happen on return with error).
> > >
> > > I don't immediately have a fix. Need to think about this some more..
> >
> > [still out of sheer ignorance] How about tacking a refcnt for the whole
> > ip_append_data() scope, like in the tcp case? that will add an atomic
> > op per loop (likely, hitting the cache) but will remove some code hunk
> > in sock_zerocopy_put_abort() and sock_zerocopy_alloc().
>
> The atomic op pair is indeed what I was trying to avoid. But I also need
> to solve the problem that the final decrement will happen from the freeing
> of the other skbs in __ip_flush_pending_frames, and will not suppress
> the notification.
>
> Freeing the entire queue inside __ip_append_data, effectively making it
> a true noop on error is one approach. But that is invasive, also to non
> zerocopy codepaths, so I would rather avoid that.
>
> Perhaps I need to handle the abort logic in udp_sendmsg directly,
> after both __ip_append_data and __ip_flush_pending_frames.

Actually,

(1) the notification suppression is handled correctly, as .._abort
decrements uarg->len. If now zero, this suppresses the notification
in sock_zerocopy_callback, regardless whether that callback is
called right away or from a later kfree_skb.

(2) if moving skb_zcopy_set below getfrag, then no kfree_skb
will be called on a zerocopy skb inside __ip_append_data. So on
abort the refcount is exactly the number of zerocopy skbs on the
queue that will call sock_zerocopy_put later. Abort then only needs
to handle special case zero, and call sock_zerocopy_put right away.

Tentative fix on top of v2 (I'll squash into v3):

---

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..4b21a58329d1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1099,12 +1099,13 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)

-   if (sk->sk_type != SOCK_STREAM && !refcount_read(>refcnt))
+   if (sk->sk_type != SOCK_STREAM &&
!refcount_read(>refcnt)) {
refcount_set(>refcnt, 1);
-
-   sock_zerocopy_put(uarg);
+   sock_zerocopy_put(uarg);
+   }
}
 }

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..a19396e21b35 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1014,13 +1014,6 @@ static int __ip_append_data(struct sock *sk,
skb->csum = 0;
skb_reserve(skb, hh_len);

-   /* only the initial fragment is time stamped */
-   skb_shinfo(skb)->tx_flags = cork->tx_flags;
-   cork->tx_flags = 0;
-   skb_shinfo(skb)->tskey = tskey;
-   tskey = 0;
-   skb_zcopy_set(skb, uarg);
-
/*
 *  Find where to start putting bytes.
 */
@@ -1053,6 +1046,13 @@ static int __ip_append_data(struct sock *sk,
exthdrlen = 0;
csummode = CHECKSUM_NONE;

+   /* only the initial fragment is time stamped */
+   skb_shinfo(skb)->tx_flags = cork->tx_flags;
+   cork->tx_flags = 0;
+   skb_shinfo(skb)->tskey = tskey;
+   tskey = 0;
+   skb_zcopy_set(skb, uarg);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);

---

This patch moves all the skb_shinfo touches operations after the copy,
to avoid touching that twice.

Instead of the refcnt trick, I could also refactor sock_zerocopy_put
and call __sock_zerocopy_put

---

-void sock_zerocopy_put(struct ubuf_info *uarg)
+static void __sock_zerocopy_put(struct ubuf_info *uarg)
 {
-   if (uarg && refcount_dec_and_test(>refcnt)) {
-   if (uarg->callback)
-   uarg->callback(uarg, uarg->zerocopy);
-   else
-   consume_skb(skb_from_uarg(uarg));
-   }
+   if (uarg->callback)
+   uarg->callback(uarg, uarg->zerocopy);
+   else
+   consume_skb(skb_from_uarg(uarg));
+}
+
+void sock_zerocopy_put(struct ubuf_info *uarg)
+   if (uarg && refcount_dec_and_test(>refcnt))
+   __sock_zerocopy_put(uarg);
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put);

---


Re: [PATCH bpf-next] bpf: libbpf: retry program creation without the name

2018-11-26 Thread Quentin Monnet
2018-11-26 11:08 UTC-0800 ~ Vlad Dumitrescu 
> On Fri, Nov 23, 2018 at 2:51 AM Quentin Monnet
>  wrote:
>>
>> 2018-11-21 09:28 UTC-0800 ~ Stanislav Fomichev 
>>> On 11/21, Quentin Monnet wrote:
 2018-11-20 15:26 UTC-0800 ~ Stanislav Fomichev 
> On 11/20, Alexei Starovoitov wrote:
>> On Wed, Nov 21, 2018 at 12:18:57AM +0100, Daniel Borkmann wrote:
>>> On 11/21/2018 12:04 AM, Alexei Starovoitov wrote:
 On Tue, Nov 20, 2018 at 01:19:05PM -0800, Stanislav Fomichev wrote:
> On 11/20, Alexei Starovoitov wrote:
>> On Mon, Nov 19, 2018 at 04:46:25PM -0800, Stanislav Fomichev wrote:
>>> [Recent commit 23499442c319 ("bpf: libbpf: retry map creation 
>>> without
>>> the name") fixed this issue for maps, let's do the same for 
>>> programs.]
>>>
>>> Since commit 88cda1c9da02 ("bpf: libbpf: Provide basic API support
>>> to specify BPF obj name"), libbpf unconditionally sets 
>>> bpf_attr->name
>>> for programs. Pre v4.14 kernels don't know about programs names and
>>> return an error about unexpected non-zero data. Retry sys_bpf 
>>> without
>>> a program name to cover older kernels.
>>>
>>> Signed-off-by: Stanislav Fomichev 
>>> ---
>>>   tools/lib/bpf/bpf.c | 10 ++
>>>   1 file changed, 10 insertions(+)
>>>
>>> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
>>> index 961e1b9fc592..cbe9d757c646 100644
>>> --- a/tools/lib/bpf/bpf.c
>>> +++ b/tools/lib/bpf/bpf.c
>>> @@ -212,6 +212,16 @@ int bpf_load_program_xattr(const struct 
>>> bpf_load_program_attr *load_attr,
>>>   if (fd >= 0 || !log_buf || !log_buf_sz)
>>>   return fd;
>>>
>>> + if (fd < 0 && errno == E2BIG && load_attr->name) {
>>> + /* Retry the same syscall, but without the name.
>>> +  * Pre v4.14 kernels don't support prog names.
>>> +  */
>>
>> I'm afraid that will put unnecessary stress on the kernel.
>> This check needs to be tighter.
>> Like E2BIG and anything in the log_buf probably means that
>> E2BIG came from the verifier and nothing to do with prog_name.
>> Asking kernel to repeat is an unnecessary work.
>>
>> In general we need to think beyond this single prog_name field.
>> There are bunch of other fields in bpf_load_program_xattr() and 
>> older kernels
>> won't support them. Are we going to zero them out one by one
>> and retry? I don't think that would be practical.
> In general, we don't want to zero anything out. However,
> for this particular problem the rationale is the following:
> In commit 88cda1c9da02 we started unconditionally setting 
> {prog,map}->name
> from the 'higher' libbpf layer which breaks users on the older 
> kernels.
>
>> Also libbpf silently ignoring prog_name is not great for debugging.
>> A warning is needed.
>> But it cannot be done out of lib/bpf/bpf.c, since it's a set of 
>> syscall
>> wrappers.
>> Imo such "old kernel -> lets retry" feature should probably be done
>> at lib/bpf/libbpf.c level. inside load_program().
> For maps bpftools calls bpf_create_map_xattr directly, that's why
> for maps I did the retry on the lower level (and why for programs I 
> initially
> thought about doing the same). However, in this case maybe asking
> user to omit 'name' argument might be a better option.
>
> For program names, I agree, we might think about doing it on the 
> higher
> level (although I'm not sure whether we want to have different API
> expectations, i.e. bpf_create_map_xattr ignoring the name and
> bpf_load_program_xattr not ignoring the name).
>
> So given that rationale above, what do you think is the best way to
> move forward?
> 1. Same patch, but tighten the retry check inside 
> bpf_load_program_xattr ?
> 2. Move this retry logic into load_program and have different handling
> for bpf_create_map_xattr vs bpf_load_program_xattr ?
> 3. Do 2 and move the retry check for maps from bpf_create_map_xattr
> into bpf_object__create_maps ?
>
> (I'm slightly leaning towards #3)

 me too. I think it's cleaner for maps to do it in
 bpf_object__create_maps().
 Originally bpf.c was envisioned to be a thin layer on top of bpf 
 syscall.
 Whereas 'smart bits' would go into libbpf.c
>>>
>>> Can't we create in bpf_object__load() a small helper 
>>> bpf_object__probe_caps()
>>> which would figure this out _once_ upon start with a 

Re: [PATCH net-next,v3 00/12] add flow_rule infrastructure

2018-11-26 Thread Pablo Neira Ayuso
Hi Marcelo,

On Thu, Nov 22, 2018 at 07:08:32PM -0200, Marcelo Ricardo Leitner wrote:
> On Thu, Nov 22, 2018 at 02:22:20PM -0200, Marcelo Ricardo Leitner wrote:
> > On Wed, Nov 21, 2018 at 03:51:20AM +0100, Pablo Neira Ayuso wrote:
> > > Hi,
> > > 
> > > This patchset is the third iteration [1] [2] [3] to introduce a kernel
> > > intermediate (IR) to express ACL hardware offloads.
> > 
> > On v2 cover letter you had:
> > 
> > """
> > However, cost of this layer is very small, adding 1 million rules via
> > tc -batch, perf shows:
> > 
> >  0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action
> > """
> > 
> > The above doesn't include time spent on children calls and I'm worried
> > about the new allocation done by flow_rule_alloc(), as it can impact
> > rule insertion rate. I'll run some tests here and report back.
> 
> I'm seeing +60ms on 1.75s (~3.4%) to add 40k flower rules on ingress
> with skip_hw and tc in batch mode, with flows like:
> 
> filter add dev p6p2 parent : protocol ip prio 1 flower skip_hw
> src_mac ec:13:db:00:00:00 dst_mac ec:14:c2:00:00:00 src_ip
> 56.0.0.0 dst_ip 55.0.0.0 action drop
> 
> Only 20ms out of those 60ms were consumed within fl_change() calls
> (considering children calls), though.
> 
> Do you see something similar?  I used current net-next (d59da3fbfe3f)
> and with this patchset applied.

I see lots of send() and recv() in tc -batch via strace, using this
example rule, repeating it N times:

filter add dev eth0 parent : protocol ip pref 1 flower dst_mac 
f4:52:14:10:df:92 action mirred egress redirect dev eth1

This is taking ~8 seconds for 40k rules from my old laptop [*], this
is already not too fast (without my patchset).

I remember we discussed adding support for real batching for tc
- we can probably do this transparently by assuming that if the
skbuff length mismatches nlmsghdr->len field, then we enter the batch
mode from the kernel. This would require to update iproute2 to use
libmnl batching routines, or code that follows similar approach
otherwise.

[*] 0.5 seconds in nft (similar ruleset), this is using netlink batching.


[PATCH net-next] r8169: remove unneeded mmiowb barriers

2018-11-26 Thread Heiner Kallweit
writex() has implicit barriers, that's what makes it different from
writex_relaxed(). Therefore these calls to mmiowb() can be removed.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index 4114c2712..bb1847fd6 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1283,13 +1283,11 @@ static u16 rtl_get_events(struct rtl8169_private *tp)
 static void rtl_ack_events(struct rtl8169_private *tp, u16 bits)
 {
RTL_W16(tp, IntrStatus, bits);
-   mmiowb();
 }
 
 static void rtl_irq_disable(struct rtl8169_private *tp)
 {
RTL_W16(tp, IntrMask, 0);
-   mmiowb();
 }
 
 #define RTL_EVENT_NAPI_RX  (RxOK | RxErr)
@@ -6127,10 +6125,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff 
*skb,
if (unlikely(stop_queue))
netif_stop_queue(dev);
 
-   if (__netdev_sent_queue(dev, skb->len, skb->xmit_more)) {
+   if (__netdev_sent_queue(dev, skb->len, skb->xmit_more))
RTL_W8(tp, TxPoll, NPQ);
-   mmiowb();
-   }
 
if (unlikely(stop_queue)) {
/* Sync with rtl_tx:
@@ -6481,9 +6477,7 @@ static int rtl8169_poll(struct napi_struct *napi, int 
budget)
 
if (work_done < budget) {
napi_complete_done(napi, work_done);
-
rtl_irq_enable(tp);
-   mmiowb();
}
 
return work_done;
-- 
2.19.2



Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread David Ahern
On 11/26/18 12:06 PM, Alexis Bauvin wrote:
> Moreover, the issue of mixing default and non-default vrf needs to be
> addressed. For now it is stale, as I don’t see any solution (except for
> rewriting the whole thing as you suggested before) to address the
> "Address already in use" made by a socket of the default vrf owning the
> port across all vrfs.
> I tested both Vyatta’s changes and SO_REUSEPORT, and neither of them seem
> to work for this case.

That suggests to me the reopen should be done internally so that the
socket failure can cause the enslavement to fail with a message passed
back to the user via extack.

ie., If changing the vrf association breaks vxlan, we should detect that
and fail the change.


Re: [PATCH bpf-next] bpf: libbpf: retry program creation without the name

2018-11-26 Thread Vlad Dumitrescu
On Fri, Nov 23, 2018 at 2:51 AM Quentin Monnet
 wrote:
>
> 2018-11-21 09:28 UTC-0800 ~ Stanislav Fomichev 
> > On 11/21, Quentin Monnet wrote:
> >> 2018-11-20 15:26 UTC-0800 ~ Stanislav Fomichev 
> >>> On 11/20, Alexei Starovoitov wrote:
>  On Wed, Nov 21, 2018 at 12:18:57AM +0100, Daniel Borkmann wrote:
> > On 11/21/2018 12:04 AM, Alexei Starovoitov wrote:
> >> On Tue, Nov 20, 2018 at 01:19:05PM -0800, Stanislav Fomichev wrote:
> >>> On 11/20, Alexei Starovoitov wrote:
>  On Mon, Nov 19, 2018 at 04:46:25PM -0800, Stanislav Fomichev wrote:
> > [Recent commit 23499442c319 ("bpf: libbpf: retry map creation 
> > without
> > the name") fixed this issue for maps, let's do the same for 
> > programs.]
> >
> > Since commit 88cda1c9da02 ("bpf: libbpf: Provide basic API support
> > to specify BPF obj name"), libbpf unconditionally sets 
> > bpf_attr->name
> > for programs. Pre v4.14 kernels don't know about programs names and
> > return an error about unexpected non-zero data. Retry sys_bpf 
> > without
> > a program name to cover older kernels.
> >
> > Signed-off-by: Stanislav Fomichev 
> > ---
> >   tools/lib/bpf/bpf.c | 10 ++
> >   1 file changed, 10 insertions(+)
> >
> > diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> > index 961e1b9fc592..cbe9d757c646 100644
> > --- a/tools/lib/bpf/bpf.c
> > +++ b/tools/lib/bpf/bpf.c
> > @@ -212,6 +212,16 @@ int bpf_load_program_xattr(const struct 
> > bpf_load_program_attr *load_attr,
> >   if (fd >= 0 || !log_buf || !log_buf_sz)
> >   return fd;
> >
> > + if (fd < 0 && errno == E2BIG && load_attr->name) {
> > + /* Retry the same syscall, but without the name.
> > +  * Pre v4.14 kernels don't support prog names.
> > +  */
> 
>  I'm afraid that will put unnecessary stress on the kernel.
>  This check needs to be tighter.
>  Like E2BIG and anything in the log_buf probably means that
>  E2BIG came from the verifier and nothing to do with prog_name.
>  Asking kernel to repeat is an unnecessary work.
> 
>  In general we need to think beyond this single prog_name field.
>  There are bunch of other fields in bpf_load_program_xattr() and 
>  older kernels
>  won't support them. Are we going to zero them out one by one
>  and retry? I don't think that would be practical.
> >>> In general, we don't want to zero anything out. However,
> >>> for this particular problem the rationale is the following:
> >>> In commit 88cda1c9da02 we started unconditionally setting 
> >>> {prog,map}->name
> >>> from the 'higher' libbpf layer which breaks users on the older 
> >>> kernels.
> >>>
>  Also libbpf silently ignoring prog_name is not great for debugging.
>  A warning is needed.
>  But it cannot be done out of lib/bpf/bpf.c, since it's a set of 
>  syscall
>  wrappers.
>  Imo such "old kernel -> lets retry" feature should probably be done
>  at lib/bpf/libbpf.c level. inside load_program().
> >>> For maps bpftools calls bpf_create_map_xattr directly, that's why
> >>> for maps I did the retry on the lower level (and why for programs I 
> >>> initially
> >>> thought about doing the same). However, in this case maybe asking
> >>> user to omit 'name' argument might be a better option.
> >>>
> >>> For program names, I agree, we might think about doing it on the 
> >>> higher
> >>> level (although I'm not sure whether we want to have different API
> >>> expectations, i.e. bpf_create_map_xattr ignoring the name and
> >>> bpf_load_program_xattr not ignoring the name).
> >>>
> >>> So given that rationale above, what do you think is the best way to
> >>> move forward?
> >>> 1. Same patch, but tighten the retry check inside 
> >>> bpf_load_program_xattr ?
> >>> 2. Move this retry logic into load_program and have different handling
> >>> for bpf_create_map_xattr vs bpf_load_program_xattr ?
> >>> 3. Do 2 and move the retry check for maps from bpf_create_map_xattr
> >>> into bpf_object__create_maps ?
> >>>
> >>> (I'm slightly leaning towards #3)
> >>
> >> me too. I think it's cleaner for maps to do it in
> >> bpf_object__create_maps().
> >> Originally bpf.c was envisioned to be a thin layer on top of bpf 
> >> syscall.
> >> Whereas 'smart bits' would go into libbpf.c
> >
> > Can't we create in bpf_object__load() a small helper 
> > bpf_object__probe_caps()
> > which would figure this out _once_ upon start with a few things to 
> > probe for
> > 

Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Alexis Bauvin
Le 26 nov. 2018 à 19:26, Roopa Prabhu  a écrit :
> 
> On Mon, Nov 26, 2018 at 9:54 AM David Ahern  wrote:
>> 
>> On 11/26/18 9:32 AM, Alexis Bauvin wrote:
>>> Thanks for the review. I’ll send a v5 if you have no other comment on
>>> this version!
>> 
>> A few comments on the test script; see attached which has the changes.
>> 
>> Mainly the cleanup does not need to be called at the end since you setup
>> the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
>> those are moved to other namespaces. 'ip netns exec NAME ip ...' is more
>> efficiently done as 'ip -netns NAME ...'. The test results should align
>> like this:
>> 
>> Checking HV connectivity  [ OK ]
>> Check VM connectivity through VXLAN (underlay in the default VRF) [ OK ]
>> Check VM connectivity through VXLAN (underlay in a VRF)   [ OK ]
>> 
>> So it is easy for users to see the PASS/FAIL.
>> 
>> It would be good to copy the topology ascii art into the test script as
>> well for future users.
>> 
>> Also, add the test as a separate patch at the end and include it in
>> tools/testing/selftests/net/Makefile
>> 
>> Finally, I think you should drop the RFC and send it as a 'ready for
>> inclusion'.
> 
> I cant seem to find patch 5 in my mail box... so commenting here
> (Using reference to patch5 from here
> https://marc.info/?l=linux-netdev=154284885815549=2)
> 
> Still not convinced that the auto reopen is justified here IMO because
> it can be done from user-space and there are many cases where this is
> already done from user-space. A few questions for alexis on that,

I do agree on this. The test shows that a simple down/up is enough, and
the patch was written as a mere convenience.

> - What is the reason for handling NETDEV_CHANGE on the vxlan device
> from the notifier handler. It can be really done in the changelink
> handler, correct  ?

Looks correct to me. The reason is nothing more than me not thinking
about the netlink handlers.

> - Also, IIUC, patch5 blindly re-opens the vxlan device without
> considering if the admin had set it to down last (ie the last state on
> it was vxlan_close). is that correct ?

It is correct. This is a big oversight from my side, that could have
led to crashes. Fortunately the underlying code will check if the
sockets are null (which they are if the interface is down) before
accessing them.

> (Don't want to block the entire series for just patch5. Patch5 can be
> done incrementally after we converge on it. The rest of the series
> looks good as David has already reviewed.  And nice to see the test!).

Thanks!

Given the aforementioned oversight when handling a down interface, it is
best to wait for a better solution for this patch.
Moreover, the issue of mixing default and non-default vrf needs to be
addressed. For now it is stale, as I don’t see any solution (except for
rewriting the whole thing as you suggested before) to address the
"Address already in use" made by a socket of the default vrf owning the
port across all vrfs.
I tested both Vyatta’s changes and SO_REUSEPORT, and neither of them seem
to work for this case.


Re: Did You Receive My Last Mail?

2018-11-26 Thread Reem Al-Hashimi
Hello,

My name is ms. Reem Al-Hashimi. The UAE minister of state for international 
cooparation. I got your contact from an email database from your country. I 
have a financial transaction i would like to discuss with you. Please reply to 
reem2...@daum.net, for more details if you are interested.

Regards,

Ms. Reem Al-Hashimi


Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Roopa Prabhu
On Mon, Nov 26, 2018 at 9:54 AM David Ahern  wrote:
>
> On 11/26/18 9:32 AM, Alexis Bauvin wrote:
> > Thanks for the review. I’ll send a v5 if you have no other comment on
> > this version!
>
> A few comments on the test script; see attached which has the changes.
>
> Mainly the cleanup does not need to be called at the end since you setup
> the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
> those are moved to other namespaces. 'ip netns exec NAME ip ...' is more
> efficiently done as 'ip -netns NAME ...'. The test results should align
> like this:
>
> Checking HV connectivity  [ OK ]
> Check VM connectivity through VXLAN (underlay in the default VRF) [ OK ]
> Check VM connectivity through VXLAN (underlay in a VRF)   [ OK ]
>
> So it is easy for users to see the PASS/FAIL.
>
> It would be good to copy the topology ascii art into the test script as
> well for future users.
>
> Also, add the test as a separate patch at the end and include it in
> tools/testing/selftests/net/Makefile
>
> Finally, I think you should drop the RFC and send it as a 'ready for
> inclusion'.

I cant seem to find patch 5 in my mail box... so commenting here
(Using reference to patch5 from here
https://marc.info/?l=linux-netdev=154284885815549=2)

Still not convinced that the auto reopen is justified here IMO because
it can be done from user-space and there are many cases where this is
already done from user-space. A few questions for alexis on that,
- What is the reason for handling NETDEV_CHANGE on the vxlan device
from the notifier handler. It can be really done in the changelink
handler, correct  ?
- Also, IIUC, patch5 blindly re-opens the vxlan device without
considering if the admin had set it to down last (ie the last state on
it was vxlan_close). is that correct ?

(Don't want to block the entire series for just patch5. Patch5 can be
done incrementally after we converge on it. The rest of the series
looks good as David has already reviewed.  And nice to see the test!).


[iproute PATCH] man: rdma: Add reference to rdma-resource.8

2018-11-26 Thread Phil Sutter
All rdma-related man pages list each other in SEE ALSO section, only
rdma-resource.8 is missing. Add it for the sake of consistency.

Signed-off-by: Phil Sutter 
---
 man/man8/rdma-dev.8  | 1 +
 man/man8/rdma-link.8 | 1 +
 man/man8/rdma.8  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8
index 461681b60f54d..b7abfe1088c2f 100644
--- a/man/man8/rdma-dev.8
+++ b/man/man8/rdma-dev.8
@@ -49,6 +49,7 @@ Shows the state of specified RDMA device.
 .SH SEE ALSO
 .BR rdma (8),
 .BR rdma-link (8),
+.BR rdma-resource (8),
 .br
 
 .SH AUTHOR
diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index 97dd8bb994d24..bddf34746e8b2 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -49,6 +49,7 @@ Shows the state of specified rdma link.
 .SH SEE ALSO
 .BR rdma (8),
 .BR rdma-dev (8),
+.BR rdma-resource (8),
 .br
 
 .SH AUTHOR
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
index 12aa149bbaf3e..b2b5aef866ab0 100644
--- a/man/man8/rdma.8
+++ b/man/man8/rdma.8
@@ -106,6 +106,7 @@ Exit status is 0 if command was successful or a positive 
integer upon failure.
 .SH SEE ALSO
 .BR rdma-dev (8),
 .BR rdma-link (8),
+.BR rdma-resource (8),
 .br
 
 .SH REPORTING BUGS
-- 
2.19.0



Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

2018-11-26 Thread Willem de Bruijn
On Mon, Nov 26, 2018 at 1:04 PM Paolo Abeni  wrote:
>
> On Mon, 2018-11-26 at 12:59 -0500, Willem de Bruijn wrote:
> > The callers of this function do flush the queue of the other skbs on
> > error, but only after the call to sock_zerocopy_put_abort.
> >
> > sock_zerocopy_put_abort depends on total rollback to revert the
> > sk_zckey increment and suppress the completion notification (which
> > must not happen on return with error).
> >
> > I don't immediately have a fix. Need to think about this some more..
>
[still out of sheer ignorance] How about taking a refcnt for the whole
> ip_append_data() scope, like in the tcp case? that will add an atomic
> op per loop (likely, hitting the cache) but will remove some code hunk
> in sock_zerocopy_put_abort() and sock_zerocopy_alloc().

The atomic op pair is indeed what I was trying to avoid. But I also need
to solve the problem that the final decrement will happen from the freeing
of the other skbs in __ip_flush_pending_frames, and will not suppress
the notification.

Freeing the entire queue inside __ip_append_data, effectively making it
a true noop on error is one approach. But that is invasive, also to non
zerocopy codepaths, so I would rather avoid that.

Perhaps I need to handle the abort logic in udp_sendmsg directly,
after both __ip_append_data and __ip_flush_pending_frames.


Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

2018-11-26 Thread Willem de Bruijn
On Mon, Nov 26, 2018 at 11:32 AM Paolo Abeni  wrote:
>
> Hi,
>
> Sorry for the long delay...
>
> On Mon, 2018-11-26 at 10:29 -0500, Willem de Bruijn wrote:
> > @@ -1109,6 +1128,7 @@ static int __ip_append_data(struct sock *sk,
> >  error_efault:
> >   err = -EFAULT;
> >  error:
> > + sock_zerocopy_put_abort(uarg);
> >   cork->length -= length;
> >   IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
> >   refcount_add(wmem_alloc_delta, >sk_wmem_alloc);
>
> Out of sheer ignorance on my side, don't we have a bad reference
> accounting if e.g.:
>
> - uarg is attached to multiple skbs, each holding a ref,
> - there is a failure on 'getfrag()'
>
> Such failure will release 2 references (1 kfree_skb(), and another in
> the above sock_zerocopy_put_abort(), as the count is still positive).

Thanks Paolo. Indeed, I had not anticipated the case of partial failure,
where more than one skb is allocated (or converted to zerocopy) and
only the last one is freed on error inside __ip_append_data.

The callers of this function do flush the queue of the other skbs on
error, but only after the call to sock_zerocopy_put_abort.

sock_zerocopy_put_abort depends on total rollback to revert the
sk_zckey increment and suppress the completion notification (which
must not happen on return with error).

I don't immediately have a fix. Need to think about this some more..


Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

2018-11-26 Thread Paolo Abeni
On Mon, 2018-11-26 at 12:59 -0500, Willem de Bruijn wrote:
> The callers of this function do flush the queue of the other skbs on
> error, but only after the call to sock_zerocopy_put_abort.
> 
> sock_zerocopy_put_abort depends on total rollback to revert the
> sk_zckey increment and suppress the completion notification (which
> must not happen on return with error).
> 
> I don't immediately have a fix. Need to think about this some more..

[still out of sheer ignorance] How about taking a refcnt for the whole
ip_append_data() scope, like in the tcp case? that will add an atomic
op per loop (likely, hitting the cache) but will remove some code hunk
in sock_zerocopy_put_abort() and sock_zerocopy_alloc().

Cheers,

Paolo



Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread David Ahern
On 11/26/18 9:32 AM, Alexis Bauvin wrote:
> Thanks for the review. I’ll send a v5 if you have no other comment on
> this version!

A few comments on the test script; see attached which has the changes.

Mainly the cleanup does not need to be called at the end since you setup
the exit trap. The cleanup calls ip to delete veth-hv-1 and veth-tap but
those are moved to other namespaces. 'ip netns exec NAME ip ...' is more
efficiently done as 'ip -netns NAME ...'. The test results should align
like this:

Checking HV connectivity  [ OK ]
Check VM connectivity through VXLAN (underlay in the default VRF) [ OK ]
Check VM connectivity through VXLAN (underlay in a VRF)   [ OK ]

So it is easy for users to see the PASS/FAIL.

It would be good to copy the topology ascii art into the test script as
well for future users.

Also, add the test as a separate patch at the end and include it in
tools/testing/selftests/net/Makefile

Finally, I think you should drop the RFC and send it as a 'ready for
inclusion'.


test_vxlan_under_vrf.sh
Description: Bourne shell script


Re: [PATCH net] ixgbe: recognize 1000BaseLX SFP modules as 1Gbps

2018-11-26 Thread Josh Elsasser
Bjørn Mork  wrote:

> Not that it matters much I guess, but I think LX SFPs were unsupported
> at that time. The LX support appears to have been added under the radar
> while refactoring ixgbe_setup_sfp_modules_X550em in commit e23f33367882
> ("ixgbe: Fix 1G and 10G link stability for X550EM_x SFP+")

Looks like you’re right. Want me to respin with an additional “Fixes” tag?

- Josh

[PATCH v2 net] lan743x: Enable driver to work with LAN7431

2018-11-26 Thread Bryan Whitehead
This driver was designed to work with both LAN7430 and LAN7431.
The only difference between the two is the LAN7431 has support
for external phy.

This change adds LAN7431 to the list of recognized devices
supported by this driver.

Updates for v2:
changed 'fixes' tag to match defined format

fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver")
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_main.c | 1 +
 drivers/net/ethernet/microchip/lan743x_main.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index 867cddb..e2f1531 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -3017,6 +3017,7 @@ static const struct dev_pm_ops lan743x_pm_ops = {
 
 static const struct pci_device_id lan743x_pcidev_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7430) },
+   { PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7431) },
{ 0, }
 };
 
diff --git a/drivers/net/ethernet/microchip/lan743x_main.h 
b/drivers/net/ethernet/microchip/lan743x_main.h
index 0e82b63..2d6eea1 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.h
+++ b/drivers/net/ethernet/microchip/lan743x_main.h
@@ -548,6 +548,7 @@ struct lan743x_adapter;
 /* SMSC acquired EFAR late 1990's, MCHP acquired SMSC 2012 */
 #define PCI_VENDOR_ID_SMSC PCI_VENDOR_ID_EFAR
 #define PCI_DEVICE_ID_SMSC_LAN7430 (0x7430)
+#define PCI_DEVICE_ID_SMSC_LAN7431 (0x7431)
 
 #define PCI_CONFIG_LENGTH  (0x1000)
 
-- 
2.7.4



[Patch net-next] net: explain __skb_checksum_complete() with comments

2018-11-26 Thread Cong Wang
Cc: Herbert Xu 
Signed-off-by: Cong Wang 
---
 net/core/dev.c|  1 +
 net/core/skbuff.c | 18 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 368dc3b49dc0..ee0a4ac0bbb6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5787,6 +5787,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 
/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+   /* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6f2ea0f0fb75..530097df328f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2637,6 +2637,7 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, 
int len)
__sum16 sum;
 
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+   /* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
@@ -2648,6 +2649,15 @@ __sum16 __skb_checksum_complete_head(struct sk_buff 
*skb, int len)
 }
 EXPORT_SYMBOL(__skb_checksum_complete_head);
 
+/* This function assumes skb->csum already holds pseudo header's checksum,
+ * which has been changed from the hardware checksum, for example, by
+ * __skb_checksum_validate_complete(). And, the original skb->csum must
+ * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
+ *
+ * It returns non-zero if the recomputed checksum is still invalid, otherwise
+ * zero. The new checksum is stored back into skb->csum unless the skb is
+ * shared.
+ */
 __sum16 __skb_checksum_complete(struct sk_buff *skb)
 {
__wsum csum;
@@ -2655,8 +2665,14 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
 
csum = skb_checksum(skb, 0, skb->len, 0);
 
-   /* skb->csum holds pseudo checksum */
sum = csum_fold(csum_add(skb->csum, csum));
+   /* This check is inverted, because we already knew the hardware
+* checksum is invalid before calling this function. So, if the
+* re-computed checksum is valid instead, then we have a mismatch
+* between the original skb->csum and skb_checksum(). This means either
+* the original hardware checksum is incorrect or we screw up skb->csum
+* when moving skb->data around.
+*/
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
-- 
2.19.1



Re: [PATCH bpf-next 2/2] tools/bpf: change selftest test_btf for both jit and non-jit

2018-11-26 Thread Martin Lau
On Sat, Nov 24, 2018 at 11:20:45PM -0800, Yonghong Song wrote:
> The selftest test_btf is changed to test both jit and non-jit.
> The test result should be the same regardless of whether jit
> is enabled or not.
Acked-by: Martin KaFai Lau 


Re: [PATCH bpf-next 1/2] bpf: btf: support proper non-jit func info

2018-11-26 Thread Martin Lau
On Sat, Nov 24, 2018 at 11:20:44PM -0800, Yonghong Song wrote:
> Commit 838e96904ff3 ("bpf: Introduce bpf_func_info")
> added bpf func info support. The userspace is able
> to get better ksym's for bpf programs with jit, and
> is able to print out func prototypes.
> 
> For a program containing func-to-func calls, the existing
> implementation returns user specified number of function
> calls and BTF types if jit is enabled. If the jit is not
> enabled, it only returns the type for the main function.
> 
> This is undesirable. Interpreter may still be used
> and we should keep feature identical regardless of
> whether jit is enabled or not.
> This patch fixed this discrepancy.
Acked-by: Martin KaFai Lau 


[PATCH v3 net] lan743x: fix return value for lan743x_tx_napi_poll

2018-11-26 Thread Bryan Whitehead
The lan743x driver, when under heavy traffic load, has been noticed
to sometimes hang, or cause a kernel panic.

Debugging reveals that the TX napi poll routine was returning
the wrong value, 'weight'. Most other drivers return 0.
And call napi_complete, instead of napi_complete_done.

Additionally when creating the tx napi poll routine.
Changed netif_napi_add, to netif_tx_napi_add.

Updates for v3:
changed 'fixes' tag to match defined format

Updates for v2:
use napi_complete, instead of napi_complete_done in
lan743x_tx_napi_poll
use netif_tx_napi_add, instead of netif_napi_add for
registration of tx napi poll routine

fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver")
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_main.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index 867cddb..d627129 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -1672,7 +1672,7 @@ static int lan743x_tx_napi_poll(struct napi_struct *napi, 
int weight)
netif_wake_queue(adapter->netdev);
}
 
-   if (!napi_complete_done(napi, weight))
+   if (!napi_complete(napi))
goto done;
 
/* enable isr */
@@ -1681,7 +1681,7 @@ static int lan743x_tx_napi_poll(struct napi_struct *napi, 
int weight)
lan743x_csr_read(adapter, INT_STS);
 
 done:
-   return weight;
+   return 0;
 }
 
 static void lan743x_tx_ring_cleanup(struct lan743x_tx *tx)
@@ -1870,9 +1870,9 @@ static int lan743x_tx_open(struct lan743x_tx *tx)
tx->vector_flags = lan743x_intr_get_vector_flags(adapter,
 INT_BIT_DMA_TX_
 (tx->channel_number));
-   netif_napi_add(adapter->netdev,
-  >napi, lan743x_tx_napi_poll,
-  tx->ring_size - 1);
+   netif_tx_napi_add(adapter->netdev,
+ >napi, lan743x_tx_napi_poll,
+ tx->ring_size - 1);
napi_enable(>napi);
 
data = 0;
-- 
2.7.4



Compliment of the day,

2018-11-26 Thread Mr.Philippines
Compliment of the day,

I am Mr.Philippine.Kabore I Have a Business Proposal of $5.3 million
For You. I am aware of the unsafe nature of the internet, and was
compelled to use this medium due to the nature of this project.

I have access to very vital information that can be used to transfer
this huge amount of money, which may culminate into the investment of
the said funds into your company or any lucrative venture in your
country.

If you will like to assist me as a partner then indicate your
interest, after which we shall both discuss the modalities and the
sharing percentage.

Upon receipt of your reply on your expression of Interest I will give
you full details,
on how the business will be executed I am open for negotiation. You
should forward your reply to this private email id
(mrphilippines...@yahoo.com) Thanks for your anticipated cooperation.

Note you might receive this message in your inbox or spam or junk
folder, depends on your web host or server network.

Thanks’
Best Regards
Mr.Philippine.Kabore,


Re: [RFC v4 3/5] vxlan: add support for underlay in non-default VRF

2018-11-26 Thread Alexis Bauvin
Le 22 nov. 2018 à 18:19, David Ahern  a écrit :
> On 11/21/18 6:07 PM, Alexis Bauvin wrote:
>> Creating a VXLAN device with is underlay in the non-default VRF makes
>> egress route lookup fail or incorrect since it will resolve in the
>> default VRF, and ingress fail because the socket listens in the default
>> VRF.
>> 
>> This patch binds the underlying UDP tunnel socket to the l3mdev of the
>> lower device of the VXLAN device. This will listen in the proper VRF and
>> output traffic from said l3mdev, matching l3mdev routing rules and
>> looking up the correct routing table.
>> 
>> When the VXLAN device does not have a lower device, or the lower device
>> is in the default VRF, the socket will not be bound to any interface,
>> keeping the previous behaviour.
>> 
>> The underlay l3mdev is deduced from the VXLAN lower device
>> (IFLA_VXLAN_LINK).
>> 
>> +--+ +-+
>> |  | | |
>> | vrf-blue | | vrf-red |
>> |  | | |
>> ++-+ +++
>> ||
>> ||
>> ++-+ +++
>> |  | | |
>> | br-blue  | | br-red  |
>> |  | | |
>> ++-+ +---+-+---+
>> |   | |
>> | +-+ +-+
>> | | |
>> ++-++--++   +++
>> |  |  lower device  |   |   | |
>> |   eth0   | <- - - - - - - | vxlan-red |   | tap-red | (... more taps)
>> |  ||   |   | |
>> +--++---+   +-+
>> 
>> Signed-off-by: Alexis Bauvin 
>> Reviewed-by: Amine Kherbouche 
>> Tested-by: Amine Kherbouche 
>> ---
>> drivers/net/vxlan.c   | 32 +--
>> .../selftests/net/test_vxlan_under_vrf.sh | 90 +++
>> 2 files changed, 114 insertions(+), 8 deletions(-)
>> create mode 100755 tools/testing/selftests/net/test_vxlan_under_vrf.sh
>> 
> 
> Reviewed-by: David Ahern 
> 
> Thanks for adding the test case; I'll try it out next week (after the
> holidays).

Thanks for the review. I’ll send a v5 if you have no other comment on
this version!


Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

2018-11-26 Thread Paolo Abeni
Hi,

Sorry for the long delay...

On Mon, 2018-11-26 at 10:29 -0500, Willem de Bruijn wrote:
> @@ -1109,6 +1128,7 @@ static int __ip_append_data(struct sock *sk,
>  error_efault:
>   err = -EFAULT;
>  error:
> + sock_zerocopy_put_abort(uarg);
>   cork->length -= length;
>   IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
>   refcount_add(wmem_alloc_delta, >sk_wmem_alloc);

Out of sheer ignorance on my side, don't we have a bad reference
accounting if e.g.:

- uarg is attached to multiple skbs, each holding a ref, 
- there is a failure on 'getfrag()'

Such failure will release 2 references (1 kfree_skb(), and another in
the above sock_zerocopy_put_abort(), as the count is still positive).

Cheers,

Paolo



[PATCH v2 net-next 6/8] dpaa2-eth: Add support for XDP_TX

2018-11-26 Thread Ioana Ciocoi Radulescu
Send frames back on the same port for XDP_TX action.
Since the frame buffers have been allocated by us, we can recycle
them directly into the Rx buffer pool instead of requesting a
confirmation frame upon transmission complete.

Signed-off-by: Ioana Radulescu 
---
v2: XDP_TX packets count towards the tx packets and bytes counters

 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 51 +++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h |  2 +
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index c2e880b..bc582c4 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -240,14 +240,53 @@ static void xdp_release_buf(struct dpaa2_eth_priv *priv,
ch->xdp.drop_cnt = 0;
 }
 
+static int xdp_enqueue(struct dpaa2_eth_priv *priv, struct dpaa2_fd *fd,
+  void *buf_start, u16 queue_id)
+{
+   struct dpaa2_eth_fq *fq;
+   struct dpaa2_faead *faead;
+   u32 ctrl, frc;
+   int i, err;
+
+   /* Mark the egress frame hardware annotation area as valid */
+   frc = dpaa2_fd_get_frc(fd);
+   dpaa2_fd_set_frc(fd, frc | DPAA2_FD_FRC_FAEADV);
+   dpaa2_fd_set_ctrl(fd, DPAA2_FD_CTRL_ASAL);
+
+   /* Instruct hardware to release the FD buffer directly into
+* the buffer pool once transmission is completed, instead of
+* sending a Tx confirmation frame to us
+*/
+   ctrl = DPAA2_FAEAD_A4V | DPAA2_FAEAD_A2V | DPAA2_FAEAD_EBDDV;
+   faead = dpaa2_get_faead(buf_start, false);
+   faead->ctrl = cpu_to_le32(ctrl);
+   faead->conf_fqid = 0;
+
+   fq = >fq[queue_id];
+   for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) {
+   err = dpaa2_io_service_enqueue_qd(fq->channel->dpio,
+ priv->tx_qdid, 0,
+ fq->tx_qdbin, fd);
+   if (err != -EBUSY)
+   break;
+   }
+
+   return err;
+}
+
 static u32 run_xdp(struct dpaa2_eth_priv *priv,
   struct dpaa2_eth_channel *ch,
+  struct dpaa2_eth_fq *rx_fq,
   struct dpaa2_fd *fd, void *vaddr)
 {
dma_addr_t addr = dpaa2_fd_get_addr(fd);
+   struct rtnl_link_stats64 *percpu_stats;
struct bpf_prog *xdp_prog;
struct xdp_buff xdp;
u32 xdp_act = XDP_PASS;
+   int err;
+
+   percpu_stats = this_cpu_ptr(priv->percpu_stats);
 
rcu_read_lock();
 
@@ -269,6 +308,16 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
switch (xdp_act) {
case XDP_PASS:
break;
+   case XDP_TX:
+   err = xdp_enqueue(priv, fd, vaddr, rx_fq->flowid);
+   if (err) {
+   xdp_release_buf(priv, ch, addr);
+   percpu_stats->tx_errors++;
+   } else {
+   percpu_stats->tx_packets++;
+   percpu_stats->tx_bytes += dpaa2_fd_get_len(fd);
+   }
+   break;
default:
bpf_warn_invalid_xdp_action(xdp_act);
case XDP_ABORTED:
@@ -317,7 +366,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
percpu_extras = this_cpu_ptr(priv->percpu_extras);
 
if (fd_format == dpaa2_fd_single) {
-   xdp_act = run_xdp(priv, ch, (struct dpaa2_fd *)fd, vaddr);
+   xdp_act = run_xdp(priv, ch, fq, (struct dpaa2_fd *)fd, vaddr);
if (xdp_act != XDP_PASS) {
percpu_stats->rx_packets++;
percpu_stats->rx_bytes += dpaa2_fd_get_len(fd);
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 23cf9d9..5530a0e 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -139,7 +139,9 @@ struct dpaa2_faead {
 };
 
 #define DPAA2_FAEAD_A2V0x2000
+#define DPAA2_FAEAD_A4V0x0800
 #define DPAA2_FAEAD_UPDV   0x1000
+#define DPAA2_FAEAD_EBDDV  0x2000
 #define DPAA2_FAEAD_UPD0x0010
 
 /* Accessors for the hardware annotation fields that we use */
-- 
2.7.4



[PATCH v2 net-next 2/8] dpaa2-eth: Allow XDP header adjustments

2018-11-26 Thread Ioana Ciocoi Radulescu
Reserve XDP_PACKET_HEADROOM bytes in Rx buffers to allow XDP
programs to increase frame header size.

Signed-off-by: Ioana Radulescu 
---
v2: no changes

 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 43 ++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index d3cfed4..008cdf8 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -216,11 +216,15 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
 
xdp.data = vaddr + dpaa2_fd_get_offset(fd);
xdp.data_end = xdp.data + dpaa2_fd_get_len(fd);
-   xdp.data_hard_start = xdp.data;
+   xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
xdp_set_data_meta_invalid();
 
xdp_act = bpf_prog_run_xdp(xdp_prog, );
 
+   /* xdp.data pointer may have changed */
+   dpaa2_fd_set_offset(fd, xdp.data - vaddr);
+   dpaa2_fd_set_len(fd, xdp.data_end - xdp.data);
+
switch (xdp_act) {
case XDP_PASS:
break;
@@ -1483,7 +1487,7 @@ static bool xdp_mtu_valid(struct dpaa2_eth_priv *priv, 
int mtu)
 
mfl = DPAA2_ETH_L2_MAX_FRM(mtu);
linear_mfl = DPAA2_ETH_RX_BUF_SIZE - DPAA2_ETH_RX_HWA_SIZE -
-dpaa2_eth_rx_head_room(priv);
+dpaa2_eth_rx_head_room(priv) - XDP_PACKET_HEADROOM;
 
if (mfl > linear_mfl) {
netdev_warn(priv->net_dev, "Maximum MTU for XDP is %d\n",
@@ -1537,6 +1541,32 @@ static int dpaa2_eth_change_mtu(struct net_device *dev, 
int new_mtu)
return 0;
 }
 
+static int update_rx_buffer_headroom(struct dpaa2_eth_priv *priv, bool has_xdp)
+{
+   struct dpni_buffer_layout buf_layout = {0};
+   int err;
+
+   err = dpni_get_buffer_layout(priv->mc_io, 0, priv->mc_token,
+DPNI_QUEUE_RX, _layout);
+   if (err) {
+   netdev_err(priv->net_dev, "dpni_get_buffer_layout failed\n");
+   return err;
+   }
+
+   /* Reserve extra headroom for XDP header size changes */
+   buf_layout.data_head_room = dpaa2_eth_rx_head_room(priv) +
+   (has_xdp ? XDP_PACKET_HEADROOM : 0);
+   buf_layout.options = DPNI_BUF_LAYOUT_OPT_DATA_HEAD_ROOM;
+   err = dpni_set_buffer_layout(priv->mc_io, 0, priv->mc_token,
+DPNI_QUEUE_RX, _layout);
+   if (err) {
+   netdev_err(priv->net_dev, "dpni_set_buffer_layout failed\n");
+   return err;
+   }
+
+   return 0;
+}
+
 static int setup_xdp(struct net_device *dev, struct bpf_prog *prog)
 {
struct dpaa2_eth_priv *priv = netdev_priv(dev);
@@ -1560,11 +1590,18 @@ static int setup_xdp(struct net_device *dev, struct 
bpf_prog *prog)
if (up)
dpaa2_eth_stop(dev);
 
-   /* While in xdp mode, enforce a maximum Rx frame size based on MTU */
+   /* While in xdp mode, enforce a maximum Rx frame size based on MTU.
+* Also, when switching between xdp/non-xdp modes we need to reconfigure
+* our Rx buffer layout. Buffer pool was drained on dpaa2_eth_stop,
+* so we are sure no old format buffers will be used from now on.
+*/
if (need_update) {
err = set_rx_mfl(priv, dev->mtu, !!prog);
if (err)
goto out_err;
+   err = update_rx_buffer_headroom(priv, !!prog);
+   if (err)
+   goto out_err;
}
 
old = xchg(>xdp_prog, prog);
-- 
2.7.4



[PATCH v2 net-next 7/8] dpaa2-eth: Cleanup channel stats

2018-11-26 Thread Ioana Ciocoi Radulescu
Remove unused counter. Reorder fields in channel stats structure
to match the ethtool strings order and make it easier to print them
with ethtool -S.

Signed-off-by: Ioana Radulescu 
---
v2: no changes

 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c |  1 -
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h |  6 ++
 drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 16 +---
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index bc582c4..d2bc5da 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -467,7 +467,6 @@ static int consume_frames(struct dpaa2_eth_channel *ch,
return 0;
 
fq->stats.frames += cleaned;
-   ch->stats.frames += cleaned;
 
/* A dequeue operation only pulls frames from a single queue
 * into the store. Return the frame queue as an out param.
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 5530a0e..41a2a0d 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -245,12 +245,10 @@ struct dpaa2_eth_fq_stats {
 struct dpaa2_eth_ch_stats {
/* Volatile dequeues retried due to portal busy */
__u64 dequeue_portal_busy;
-   /* Number of CDANs; useful to estimate avg NAPI len */
-   __u64 cdan;
-   /* Number of frames received on queues from this channel */
-   __u64 frames;
/* Pull errors */
__u64 pull_err;
+   /* Number of CDANs; useful to estimate avg NAPI len */
+   __u64 cdan;
 };
 
 /* Maximum number of queues associated with a DPNI */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
index 26bd5a2..79eeebe 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
@@ -174,8 +174,6 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device 
*net_dev,
int j, k, err;
int num_cnt;
union dpni_statistics dpni_stats;
-   u64 cdan = 0;
-   u64 portal_busy = 0, pull_err = 0;
struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
struct dpaa2_eth_drv_stats *extras;
struct dpaa2_eth_ch_stats *ch_stats;
@@ -212,16 +210,12 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device 
*net_dev,
}
i += j;
 
-   for (j = 0; j < priv->num_channels; j++) {
-   ch_stats = >channel[j]->stats;
-   cdan += ch_stats->cdan;
-   portal_busy += ch_stats->dequeue_portal_busy;
-   pull_err += ch_stats->pull_err;
+   /* Per-channel stats */
+   for (k = 0; k < priv->num_channels; k++) {
+   ch_stats = >channel[k]->stats;
+   for (j = 0; j < sizeof(*ch_stats) / sizeof(__u64); j++)
+   *((__u64 *)data + i + j) += *((__u64 *)ch_stats + j);
}
-
-   *(data + i++) = portal_busy;
-   *(data + i++) = pull_err;
-   *(data + i++) = cdan;
 }
 
 static int prep_eth_rule(struct ethhdr *eth_value, struct ethhdr *eth_mask,
-- 
2.7.4



  1   2   >