Re: [RFC PATCH] net: mvpp2: fix detection of 10G SFP modules

2018-12-04 Thread Baruch Siach
Hi Russell,

On Tue, Dec 04, 2018 at 10:27:01AM +0000, Russell King - ARM Linux wrote:
> On Tue, Dec 04, 2018 at 12:19:54PM +0200, Baruch Siach wrote:
> > On Thu, Nov 29, 2018 at 10:00:43PM +0000, Russell King - ARM Linux wrote:
> > > On Thu, Nov 29, 2018 at 11:31:23AM -0800, Florian Fainelli wrote:
> > > > On 11/29/2018 4:49 AM, Baruch Siach wrote:
> > > > > The mvpp2_phylink_validate() relies on the interface field of
> > > > > phylink_link_state to determine valid link modes. However, when called
> > > > > from phylink_sfp_module_insert() this field is not initialized. The
> > > > > default switch case then excludes 10G link modes. This allows 10G SFP
> > > > > modules that are detected correctly to be configured at max rate of
> > > > > 2.5G.
> > > > > 
> > > > > Catch the uninitialized PHY mode case, and allow 10G rates.
> > > > > 
> > > > > Cc: Maxime Chevallier 
> > > > > Cc: Antoine Tenart 
> > > > > Signed-off-by: Baruch Siach 
> > > > > ---
> > > > > Is that the right fix?
> > > > 
> > > > It would be a bit surprising that this is the right fix, you would
> > > > expect validate to be called once everything has been parsed
> > > > successfully from the SFP, is not that the case here? If not, can you
> > > > find out what happens?
> > > 
> > > Two calls are made - the first with PHY_INTERFACE_MODE_NA to
> > > determine what the advertising link mode may be, and then again
> > > once the interface mode has been selected from the advertising mask.
> > > 
> > > Why?
> > > 
> > > Consider a 4.3Gbps fiberchannel SFP plugged into a 1G-only MAC.
> > > If we did it as a single pass, we would end up passing an
> > > interface mode of 2500BASEX first time around which is illogical.
> > 
> > So you consider this to be the right fix, right?
> 
> Yes, but there is another bug lurking here - the handling of invalid
> interface modes is not correct.  Please see mvneta.c as an example -
> interface modes that are not supported by the MAC (apart from the NA
> mode) end up with the supported mask completely cleared.

I'll take this as an ack. Thanks for reviewing.

I plan to resend this patch with your ack and a proper Fixes tag. I'll then 
add another patch to properly handle the invalid mode case.

Thanks,
baruch

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
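
As an aside, the two-pass validation Russell describes can be sketched
roughly as follows. This is a hypothetical illustration only; the names
sfp_validate_two_pass() and phylink_pick_interface() are made up, and the
real logic lives in phylink_sfp_module_insert() in drivers/net/phy/phylink.c:

/* Hypothetical sketch: the MAC validate callback runs twice on SFP
 * insertion. */
static void sfp_validate_two_pass(struct phylink *pl,
				  struct phylink_link_state *state)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(support) = { 0, };

	/* Pass 1: PHY_INTERFACE_MODE_NA asks the MAC for everything it
	 * could support, which restricts the advertising mask. */
	state->interface = PHY_INTERFACE_MODE_NA;
	pl->ops->validate(pl->netdev, support, state);

	/* Pass 2: pick a concrete interface mode from the restricted
	 * advertising mask and validate again with that mode. */
	state->interface = phylink_pick_interface(state->advertising);
	pl->ops->validate(pl->netdev, support, state);
}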


[PATCH iproute2-next 1/2] devlink: Add string to uint{8,16,32} conversion for generic parameters

2018-12-04 Thread Shalom Toledo
Allow setting u{8,16,32} generic parameters as well-defined strings in the
devlink user space tool.

Signed-off-by: Shalom Toledo 
Reviewed-by: Jiri Pirko 
---
 devlink/devlink.c | 145 ++
 1 file changed, 135 insertions(+), 10 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 8bb254ea1b0b..1e3deb24d214 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1920,10 +1920,69 @@ static int cmd_dev_eswitch(struct dl *dl)
return -ENOENT;
 }
 
-static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl)
+struct param_val_conv {
+   const char *name;
+   const char *vstr;
+   uint32_t vuint;
+};
+
+static bool param_val_conv_exists(const struct param_val_conv *param_val_conv,
+ uint32_t len, const char *name)
+{
+   uint32_t i;
+
+   for (i = 0; i < len; i++)
+   if (!strcmp(param_val_conv[i].name, name))
+   return true;
+
+   return false;
+}
+
+static int
+param_val_conv_uint_get(const struct param_val_conv *param_val_conv,
+   uint32_t len, const char *name, const char *vstr,
+   uint32_t *vuint)
+{
+   uint32_t i;
+
+   for (i = 0; i < len; i++)
+   if (!strcmp(param_val_conv[i].name, name) &&
+   !strcmp(param_val_conv[i].vstr, vstr)) {
+   *vuint = param_val_conv[i].vuint;
+   return 0;
+   }
+
+   return -ENOENT;
+}
+
+static int
+param_val_conv_str_get(const struct param_val_conv *param_val_conv,
+  uint32_t len, const char *name, uint32_t vuint,
+  const char **vstr)
+{
+   uint32_t i;
+
+   for (i = 0; i < len; i++)
+   if (!strcmp(param_val_conv[i].name, name) &&
+   param_val_conv[i].vuint == vuint) {
+   *vstr = param_val_conv[i].vstr;
+   return 0;
+   }
+
+   return -ENOENT;
+}
+
+static const struct param_val_conv param_val_conv[] = {};
+
+#define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv)
+
+static void pr_out_param_value(struct dl *dl, const char *nla_name,
+  int nla_type, struct nlattr *nl)
 {
struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {};
struct nlattr *val_attr;
+   const char *vstr;
+   bool conv_exists;
int err;
 
err = mnl_attr_parse_nested(nl, attr_cb, nla_value);
@@ -1939,15 +1998,51 @@ static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl)
	pr_out_str(dl, "cmode",
		   param_cmode_name(mnl_attr_get_u8(nla_value[DEVLINK_ATTR_PARAM_VALUE_CMODE])));
val_attr = nla_value[DEVLINK_ATTR_PARAM_VALUE_DATA];
 
+   conv_exists = param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN,
+   nla_name);
+
switch (nla_type) {
case MNL_TYPE_U8:
-   pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr));
+   if (conv_exists) {
+   err = param_val_conv_str_get(param_val_conv,
+PARAM_VAL_CONV_LEN,
+nla_name,
+mnl_attr_get_u8(val_attr),
+&vstr);
+   if (err)
+   return;
+   pr_out_str(dl, "value", vstr);
+   } else {
+   pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr));
+   }
break;
case MNL_TYPE_U16:
-   pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr));
+   if (conv_exists) {
+   err = param_val_conv_str_get(param_val_conv,
+PARAM_VAL_CONV_LEN,
+nla_name,
+mnl_attr_get_u16(val_attr),
+&vstr);
+   if (err)
+   return;
+   pr_out_str(dl, "value", vstr);
+   } else {
+   pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr));
+   }
break;
case MNL_TYPE_U32:
-   pr_out_uint(dl, "value", mnl_attr_get_u32(val_attr));
+   if (conv_exists) {
+   err = param_val_conv_str_get(param_val_conv,
+PARAM_VAL_CONV_LEN,
+nla_name,
+mnl_attr_get_u32(val_attr),
+&vstr);
+   if (err)
+  

Re: consistency for statistics with XDP mode

2018-12-04 Thread Jesper Dangaard Brouer
On Mon, 3 Dec 2018 23:24:18 -0800
Jakub Kicinski  wrote:

> On Tue, 04 Dec 2018 09:03:52 +0200, Toke Høiland-Jørgensen wrote:
> > David Miller  writes:
> >   
> > > From: David Ahern 
> > > Date: Mon, 3 Dec 2018 17:15:03 -0700
> > >
> > >> So, instead of a program tag which the program writer controls, how
> > >> about some config knob that an admin controls that says at attach time
> > >> use standard stats?
> > >
> > > How about, instead of replacing it is in addition to, and admin can
> > > override?
> > 
> > Yeah, I was also thinking about something the program writer can set,
> > but the admin can override. There could even be a system-wide setting
> > that would make the verifier inject it into all programs at load time?  
> 
> That'd be far preferable, having all drivers include the code when we
> have JITs seems like a step backward.

There is one problem with it being part of the eBPF prog.  Once an eBPF
prog is loaded you cannot change it (it is stored in a read-only page for
security reasons).  So that would not make it possible for an admin to
enable stats when troubleshooting a system.  The use-case I think we
want to support is to allow opting out due to performance concerns,
but when an admin needs to troubleshoot the system, allow the admin to
enable this system wide.

Besides, placing this in every eBPF program in the system will replicate
the stats update code (and put more I-cache pressure).  The code
needed is actually very simple:

[PATCH] xdp: add stats for XDP action codes in xdp_rxq_info

Code muckup of adding XDP stats

diff --git a/include/linux/filter.h b/include/linux/filter.h
index cc17f5f32fbb..600a95e0cbcc 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -628,7 +628,10 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 * already takes rcu_read_lock() when fetching the program, so
 * it's not necessary here anymore.
 */
-   return BPF_PROG_RUN(prog, xdp);
+   u32 action = BPF_PROG_RUN(prog, xdp);
+   // Q: will adding a branch here cost more than always accounting?
+   xdp->rxq->stats[action <= XDP_REDIRECT ? action : 0]++;
+   return action;
 }
 
 static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 4a0ca7a3d5e5..3409dd9e0fbc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -6,6 +6,7 @@
 #ifndef __LINUX_NET_XDP_H__
 #define __LINUX_NET_XDP_H__
 
+#include <uapi/linux/bpf.h>
 /**
  * DOC: XDP RX-queue information
  *
@@ -61,6 +62,8 @@ struct xdp_rxq_info {
u32 queue_index;
u32 reg_state;
struct xdp_mem_info mem;
+   // TODO: benchmark if stats should be placed on different cache-line
+   u64 stats[XDP_REDIRECT + 1];
} ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
 struct xdp_buff {



> We could probably fit the stats into the enormous padding of struct
> xdp_rxq_info, the question is - how do we get to it in a clean way..

Struct xdp_rxq_info is explicitly a read-only cache-line, which contains
static information for each RX-queue.  We could place the stats record
in the next cache-line (most HW systems fetch 2 cache-lines).  But we
can also benchmark whether it matters to change xdp_rxq_info into a
write-cache-line.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [RFC PATCH] net: mvpp2: fix detection of 10G SFP modules

2018-12-04 Thread Baruch Siach
Hi Russell,

On Thu, Nov 29, 2018 at 10:00:43PM +0000, Russell King - ARM Linux wrote:
> On Thu, Nov 29, 2018 at 11:31:23AM -0800, Florian Fainelli wrote:
> > On 11/29/2018 4:49 AM, Baruch Siach wrote:
> > > The mvpp2_phylink_validate() relies on the interface field of
> > > phylink_link_state to determine valid link modes. However, when called
> > > from phylink_sfp_module_insert() this field is not initialized. The
> > > default switch case then excludes 10G link modes. This allows 10G SFP
> > > modules that are detected correctly to be configured at max rate of
> > > 2.5G.
> > > 
> > > Catch the uninitialized PHY mode case, and allow 10G rates.
> > > 
> > > Cc: Maxime Chevallier 
> > > Cc: Antoine Tenart 
> > > Signed-off-by: Baruch Siach 
> > > ---
> > > Is that the right fix?
> > 
> > It would be a bit surprising that this is the right fix, you would
> > expect validate to be called once everything has been parsed
> > successfully from the SFP, is not that the case here? If not, can you
> > find out what happens?
> 
> Two calls are made - the first with PHY_INTERFACE_MODE_NA to
> determine what the advertising link mode may be, and then again
> once the interface mode has been selected from the advertising mask.
> 
> Why?
> 
> Consider a 4.3Gbps fiberchannel SFP plugged into a 1G-only MAC.
> If we did it as a single pass, we would end up passing an
> interface mode of 2500BASEX first time around which is illogical.

So you consider this to be the right fix, right?

I should have added:

Fixes: d97c9f4ab000b ("net: mvpp2: 1000baseX support")

Antoine, is this fix OK with you?

baruch

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -


[PATCH net-next 2/4] net: Do not route unicast IP packets twice

2018-12-04 Thread Ido Schimmel
Packets marked with 'offload_l3_fwd_mark' were already forwarded by a
capable device and should not be forwarded again by the kernel.
Therefore, have the kernel consume them.

The check is performed in ip{,6}_forward_finish() in order to allow the
kernel to process such packets in ip{,6}_forward() and generate required
exceptions. For example, ICMP redirects.

Signed-off-by: Ido Schimmel 
---
 net/ipv4/ip_forward.c | 7 +++
 net/ipv6/ip6_output.c | 7 +++
 2 files changed, 14 insertions(+)

diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 32662e9e5d21..06ee4696703c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
 
+#ifdef CONFIG_NET_SWITCHDEV
+   if (skb->offload_l3_fwd_mark) {
+   consume_skb(skb);
+   return 0;
+   }
+#endif
+
if (unlikely(opt->optlen))
ip_forward_options(skb);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ec8c235ea891..6592a39e5ec1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -378,6 +378,13 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 
+#ifdef CONFIG_NET_SWITCHDEV
+   if (skb->offload_l3_fwd_mark) {
+   consume_skb(skb);
+   return 0;
+   }
+#endif
+
return dst_output(net, sk, skb);
 }
 
-- 
2.19.1



[PATCH net-next 1/4] skbuff: Rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark'

2018-12-04 Thread Ido Schimmel
Commit abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field") added
the 'offload_mr_fwd_mark' field to indicate that a packet has already
undergone L3 multicast routing by a capable device. The field is used to
prevent the kernel from forwarding a packet through a netdev through
which the device has already forwarded the packet.

Currently, no unicast packet is routed by both the device and the
kernel, but this is about to change by subsequent patches and we need to
be able to mark such packets, so that they will not be forwarded twice.

Instead of adding yet another field to 'struct sk_buff', we can just
rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark', as a packet
either has a multicast or a unicast destination IP.

While at it, add a comment about both 'offload_fwd_mark' and
'offload_l3_fwd_mark'.

Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +-
 include/linux/skbuff.h |  4 +++-
 net/core/skbuff.c  |  2 +-
 net/ipv4/ipmr.c|  2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index c293ff1eed63..920085fbbf2a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3554,10 +3554,10 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
 }
 
-static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb,
+static void mlxsw_sp_rx_listener_l3_mark_func(struct sk_buff *skb,
  u8 local_port, void *priv)
 {
-   skb->offload_mr_fwd_mark = 1;
+   skb->offload_l3_fwd_mark = 1;
skb->offload_fwd_mark = 1;
return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
 }
@@ -3605,8 +3605,8 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port,
MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action,\
_is_ctrl, SP_##_trap_group, DISCARD)
 
-#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
-   MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \
+#define MLXSW_SP_RXL_L3_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
+   MLXSW_RXL(mlxsw_sp_rx_listener_l3_mark_func, _trap_id, _action, \
_is_ctrl, SP_##_trap_group, DISCARD)
 
 #define MLXSW_SP_EVENTL(_func, _trap_id)   \
@@ -3683,7 +3683,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false),
MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false),
MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
-   MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
+   MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
/* NVE traps */
MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, ARP, false),
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 75d50ab7997c..b1831a5ca173 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -616,6 +616,8 @@ typedef unsigned char *sk_buff_data_t;
  * @pkt_type: Packet class
  * @fclone: skbuff clone status
  * @ipvs_property: skbuff is owned by ipvs
+ * @offload_fwd_mark: Packet was L2-forwarded in hardware
+ * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
  * @tc_skip_classify: do not classify packet. set by IFB device
  * @tc_at_ingress: used within tc_classify to distinguish in/egress
  * @tc_redirected: packet was redirected by a tc action
@@ -799,7 +801,7 @@ struct sk_buff {
__u8remcsum_offload:1;
 #ifdef CONFIG_NET_SWITCHDEV
__u8offload_fwd_mark:1;
-   __u8offload_mr_fwd_mark:1;
+   __u8offload_l3_fwd_mark:1;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
__u8tc_skip_classify:1;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c78ce114537e..40552547c69a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 
 #ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
-   skb->offload_mr_fwd_mark = 0;
+   skb->offload_l3_fwd_mark = 0;
 #endif
 
if (!xnet)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a6defbec4f1b..5cbc749a50aa 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
	struct vif_device *out_vif = &mrt->vif_table[out_vifi];
	struct vif_device *in_vif = &mrt->vif_table[in_vifi];
 
-   if (!skb->offload_mr_fwd_mark)
+   if (!skb->offload_l3_fwd_mark)
return false;

[PATCH net-next 4/4] selftests: mlxsw: Add one-armed router test

2018-12-04 Thread Ido Schimmel
Construct a "one-armed router" topology and test that packets are
forwarded by the ASIC and that a copy of the packet is sent to the
kernel, which does not forward the packet again.

Signed-off-by: Ido Schimmel 
---
 .../drivers/net/mlxsw/one_armed_router.sh | 259 ++
 1 file changed, 259 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh b/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh
new file mode 100755
index 000000000000..f02d83e94576
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh
@@ -0,0 +1,259 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test a "one-armed router" [1] scenario. Packets forwarded between H1 and H2
+# should be forwarded by the ASIC, but also trapped so that ICMP redirect
+# packets could be potentially generated.
+#
+# 1. https://en.wikipedia.org/wiki/One-armed_router
+#
+# +-+
+# | H1 (vrf)|
+# |+ $h1|
+# || 192.0.2.1/24   |
+# || 2001:db8:1::1/64   |
+# |||
+# ||  default via 192.0.2.2 |
+# ||  default via 2001:db8:1::2 |
+# +|+
+#  |
+# +|--+
+# | SW |  |
+# | +--|+ |
+# | |  + $swp1   BR0 (802.1d)   | |
+# | |   | |
+# | |192.0.2.2/24   | |
+# | |  2001:db8:1::2/64 | |
+# | |   198.51.100.2/24 | |
+# | |  2001:db8:2::2/64 | |
+# | |   | |
+# | |  + $swp2  | |
+# | +--|+ |
+# ||  |
+# +|--+
+#  |
+# +|+
+# ||  default via 198.51.100.2  |
+# ||  default via 2001:db8:2::2 |
+# |||
+# || 2001:db8:2::1/64   |
+# || 198.51.100.1/24|
+# |+ $h2|
+# | H2 (vrf)|
+# +-+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="ping_ipv4 ping_ipv6 fwd_mark_ipv4 fwd_mark_ipv6"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+h1_create()
+{
+   simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+   ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+   ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+h1_destroy()
+{
+   ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2
+   ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+   simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+h2_create()
+{
+   simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64
+
+   ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+   ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2
+}
+
+h2_destroy()
+{
+   ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+   ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+   simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
+}
+
+switch_create()
+{
+   ip link add name br0 type bridge mcast_snooping 0
+   ip link set dev br0 up
+
+   ip link set dev $swp1 master br0
+   ip link set dev $swp1 up
+   ip link set dev $swp2 master br0
+   ip link set dev $swp2 up
+
+   tc qdisc add dev $swp1 clsact
+   tc qdisc add dev $swp2 clsact
+
+   __addr_add_del br0 add 192.0.2.2/24 2001:db8:1::2/64
+   __addr_add_del br0 add 198.51.100.2/24 2001:db8:2::2/64
+}
+
+switch_destroy()
+{
+   __addr_add_del br0 del 198.51.100.2/24 2001:db8:2::2/64
+   __addr_add_del br0 del 192.0.2.2/24 2001:db8:1::2/64
+
+   tc qdisc del dev $swp2 clsact
+   tc qdisc del dev $swp1 clsact
+
+   ip link set dev $swp2 down
+   ip link set dev $swp2 nomaster
+   ip link set dev $swp1 down
+   ip link set dev $swp1 nomaster
+
+   ip link set dev br0 down
+   ip link del dev br0
+}
+
+ping_ipv4()
+{
+   ping_test $h1 198.51.100.1 ": h1->h2"
+}
+
+ping_ipv6()
+{
+   ping6_test $h1 2001:db8:2::1 ": h1->h2"
+}
+
+fwd_mark_ipv4()
+{
+   # Transmit packets 

[PATCH net-next 0/4] mlxsw: Add one-armed router support

2018-12-04 Thread Ido Schimmel
Up until now, when a packet was routed by the ASIC through the same
router interface (RIF) from which it ingressed, the ASIC passed the
sole copy of the packet to the kernel. This allowed the kernel to route
the packet and also potentially generate an ICMP redirect.

There are scenarios (e.g., "one-armed router") where packets are
intentionally routed this way and are therefore not deemed as
exceptions. In such scenarios the current method of trapping packets to
the CPU is problematic, as it results in major packet loss.

This patchset solves the problem by having the ASIC forward the packet,
but also send a copy to the CPU, which gives the kernel the opportunity
to generate required exceptions.

To prevent the kernel from forwarding such packets again, the driver
marks them with 'offload_l3_fwd_mark', which causes the kernel to
consume them in ip{,6}_forward_finish().

Patch #1 renames 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark'. When
set, the field indicates that a packet was already forwarded in L3
(unicast / multicast) by a capable device.

Patch #2 teaches the kernel to consume unicast packets that have
'offload_l3_fwd_mark' set.

Patch #3 changes mlxsw to mirror loopbacked (iRIF == eRIF) packets,
instead of trapping them.

Patch #4 adds a test case for the above-mentioned scenario.

Ido Schimmel (4):
  skbuff: Rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark'
  net: Do not route unicast IP packets twice
  mlxsw: spectrum: Mirror loopbacked packets instead of trapping them
  selftests: mlxsw: Add one-armed router test

 drivers/net/ethernet/mellanox/mlxsw/reg.h |   1 +
 .../net/ethernet/mellanox/mlxsw/spectrum.c|  14 +-
 include/linux/skbuff.h|   4 +-
 net/core/skbuff.c |   2 +-
 net/ipv4/ip_forward.c |   7 +
 net/ipv4/ipmr.c   |   2 +-
 net/ipv6/ip6_output.c |   7 +
 .../drivers/net/mlxsw/one_armed_router.sh | 259 ++
 8 files changed, 287 insertions(+), 9 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh

-- 
2.19.1



[PATCH net-next 3/4] mlxsw: spectrum: Mirror loopbacked packets instead of trapping them

2018-12-04 Thread Ido Schimmel
When the ASIC detects that a unicast packet is routed through the same
router interface (RIF) from which it ingressed (iRIF == eRIF), it raises
a trap called loopback error (LBERROR).

Thus far, this trap was configured to send a sole copy of the packet to
the CPU so that ICMP redirect packets could be potentially generated by
the kernel.

This is problematic as the CPU cannot forward packets at 3.2Tb/s and
there are scenarios (e.g., "one-armed router") where iRIF == eRIF is not
an exception.

Solve this by changing the trap to send a copy of the packet to the CPU.
To prevent the kernel from forwarding the packet again, it is marked
with 'offload_l3_fwd_mark'.

The trap is configured in a trap group of its own with a dedicated
policer in order not to prevent packets trapped by other traps from
reaching the CPU.

Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h  | 1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 5c8a38f61a93..b9c37b0bb175 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -5072,6 +5072,7 @@ enum mlxsw_reg_htgt_trap_group {
MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT,
MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD,
MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND,
+   MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR,
 };
 
 /* reg_htgt_trap_group
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 920085fbbf2a..0f37b203103a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3639,7 +3639,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
/* L3 traps */
MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
-   MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+   MLXSW_SP_RXL_L3_MARK(LBERROR, MIRROR_TO_CPU, LBERROR, false),
MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP,
  false),
@@ -3713,6 +3713,7 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
+   case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR:
rate = 128;
burst_size = 7;
break;
@@ -3798,6 +3798,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
+   case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR:
priority = 1;
tc = 1;
break;
-- 
2.19.1



Re: Re: [Bug] net/ipv6: skb_over_panic in mld_newpack

2018-12-04 Thread Nicolas Belouin
On 03/12 07:59, Eric Dumazet wrote:
> 
> 
> On 12/03/2018 07:20 AM, Nicolas Belouin wrote:
> > Hi,
> > I ran into a panic while adding an interface to a bridge with a vxlan
> > interface already attached to it; it seems related to mtu=9000.
> > 
> > I get the following panic info :
> > 
> > [ 2482.419893] br100: port 2(vif1.1) entered blocking state
> > [ 2482.425427] br100: port 2(vif1.1) entered forwarding state
> > [ 2482.431797] skbuff: skb_over_panic: text:816e4f78 len:40 put:40 
> > head:880146449000 data:880146458fd0 tail:0xfff8 end:0xec0 dev:vif1.1
> > [ 2482.442891] [ cut here ]
> > [ 2482.448254] kernel BUG at 
> > /srv/jenkins/workspace/workspace/hosting-xen-dom0-kernel/build/src/linux-4.9/net/core/skbuff.c:105!
> > [ 2482.459009] invalid opcode:  [#1] PREEMPT SMP
> > [ 2482.464371] Modules linked in:
> > [ 2482.469682] CPU: 19 PID: 1317 Comm: kworker/19:1 Not tainted 
> > 4.9.135-dom0-e9d15b2-x86_64-iaas #2
> > [ 2482.480362] Hardware name: Dell Inc. PowerEdge C8220/09N44V, BIOS 2.7.1 
> > 03/04/2015
> > [ 2482.491008] Workqueue: ipv6_addrconf addrconf_dad_work
> > [ 2482.496380] task: 88017eef1a00 task.stack: c90001fcc000
> > [ 2482.501785] RIP: e030:[]  [] 
> > skb_panic+0x5f/0x70
> > [ 2482.512450] RSP: e02b:c90001fcfba8  EFLAGS: 00010296
> > [ 2482.517817] RAX: 0088 RBX: 880117fb0800 RCX: 
> > 
> > [ 2482.528447] RDX: 0088 RSI: 880184cd03c8 RDI: 
> > 880184cd03c8
> > [ 2482.539085] RBP: c90001fcfc00 R08: 06a8 R09: 
> > 81ea7359
> > [ 2482.549717] R10: 880180406f80 R11: 06a8 R12: 
> > 880147258cc0
> > [ 2482.560350] R13: c90001fcfc20 R14: 81d13440 R15: 
> > 
> > [ 2482.570993] FS:  () GS:880184cc() 
> > knlGS:
> > [ 2482.581646] CS:  e033 DS:  ES:  CR0: 80050033
> > [ 2482.587039] CR2: 7f5b17f032b0 CR3: 01c08000 CR4: 
> > 00042660
> > [ 2482.597675] Stack:
> > [ 2482.602958]  880146458fd0 fff8 0ec0 
> > 88017f3f
> > [ 2482.613619]  815efa62 816e4f78 880117fb0800 
> > c90001fcfc20
> > [ 2482.624288]  880147258cc0 88017f3f 880146502000 
> > c90001fcfc68
> > [ 2482.634955] Call Trace:
> > [ 2482.640254]  [] ? skb_put+0x42/0x50
> > [ 2482.645633]  [] ? ip6_mc_hdr.constprop.36+0x58/0xd0
> > [ 2482.651045]  [] ? mld_newpack+0x12a/0x1e0
> > [ 2482.656421]  [] ? add_grhead.isra.28+0x87/0xa0
> > [ 2482.661825]  [] ? add_grec+0x446/0x4c0
> > [ 2482.667198]  [] ? __local_bh_enable_ip+0x1b/0xb0
> > [ 2482.672609]  [] ? mld_send_initial_cr.part.29+0x58/0xa0
> > [ 2482.678022]  [] ? ipv6_mc_dad_complete+0x26/0x60
> > [ 2482.683441]  [] ? addrconf_dad_completed+0x29f/0x2c0
> > [ 2482.688850]  [] ? ipv6_dev_mc_inc+0x194/0x2c0
> > [ 2482.694249]  [] ? addrconf_dad_work+0xfe/0x3d0
> > [ 2482.699650]  [] ? _raw_spin_unlock_irq+0xd/0x20
> > [ 2482.705052]  [] ? process_one_work+0x142/0x3e0
> > [ 2482.710453]  [] ? worker_thread+0x62/0x480
> > [ 2482.715848]  [] ? process_one_work+0x3e0/0x3e0
> > [ 2482.721256]  [] ? kthread+0xe2/0x100
> > [ 2482.726621]  [] ? __switch_to+0x261/0x6b0
> > [ 2482.732006]  [] ? kthread_park+0x60/0x60
> > [ 2482.737379]  [] ? ret_from_fork+0x57/0x70
> > [ 2482.742761] Code: 00 00 48 89 44 24 10 8b 87 b0 00 00 00 48 89 44 24 08 
> > 48 8b 87 c0 00 00 00 48 c7 c7 50 8e a2 81 48 89 04 24 31 c0 e8 b5 07 b6 ff 
> > <0f> 0b 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 
> > [ 2482.759199] RIP  [] skb_panic+0x5f/0x70
> > [ 2482.764672]  RSP 
> > [ 2482.771186] ---[ end trace 6d0fe52ed049d841 ]---
> > [ 2482.776641] Kernel panic - not syncing: Fatal exception in interrupt
> > [ 2482.861621] Kernel Offset: disabled
> > 
> > I circumvented the bug by applying this patch:
> > diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
> > index 21f6deb2aec9..2762c3dcc883 100644
> > --- a/net/ipv6/mcast.c
> > +++ b/net/ipv6/mcast.c
> > @@ -1605,8 +1605,6 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
> >  IPV6_TLV_PADN, 0 };
> > 
> > /* we assume size > sizeof(ra) here */
> > -   /* limit our allocations to order-0 page */
> > -   size = min_t(int, size, SKB_MAX_ORDER(0, 0));
> > skb = sock_alloc_send_skb(sk, size, 1, &err);
> > 
> > if (!skb)
> > 
> > The lines are introduced by commit 72e09ad107e78d69ff4d3b97a69f0aad2b77280f
> > stating that "order-2 GFP_ATOMIC allocations are very unreliable"
> > I then wonder if this statement is still relevant, or if such a patch
> > would be acceptable to you.
> 
> Thanks for the report, but this patch is not correct.
> 
> I rather suspect commit 1837b2e2bcd23137766555a63867e649c0b637f0 ("mld, igmp: 
> Fix reserved tailroom calculation")
> is the problem.
> 

If I follow the code here, skb_over_panic is triggered if
skb->tail > 

[PATCH iproute2-next 2/2] devlink: Add support for 'fw_load_policy' generic parameter

2018-12-04 Thread Shalom Toledo
Add string to uint conversion for 'fw_load_policy' generic parameter.

Signed-off-by: Shalom Toledo 
Reviewed-by: Jiri Pirko 
---
 devlink/devlink.c| 13 -
 include/uapi/linux/devlink.h |  5 +
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 1e3deb24d214..3651e90c1159 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1972,7 +1972,18 @@ param_val_conv_str_get(const struct param_val_conv *param_val_conv,
return -ENOENT;
 }
 
-static const struct param_val_conv param_val_conv[] = {};
+static const struct param_val_conv param_val_conv[] = {
+   {
+   .name = "fw_load_policy",
+   .vstr = "driver",
+   .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER,
+   },
+   {
+   .name = "fw_load_policy",
+   .vstr = "flash",
+   .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH,
+   },
+};
 
 #define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv)
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5ee0e7397591..d0a33d79dc22 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -163,6 +163,11 @@ enum devlink_param_cmode {
DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1
 };
 
+enum devlink_param_fw_load_policy_value {
+   DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER,
+   DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH,
+};
+
 enum devlink_attr {
/* don't change the order or add anything between, this is ABI! */
DEVLINK_ATTR_UNSPEC,
-- 
2.17.2
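
To see how the pieces fit together, here is a small illustrative snippet
(not part of the patch) showing how the helpers added in patch 1/2 resolve
a user-supplied string against the table above:

/* Illustration only: map the string "flash" to its wire value. */
uint32_t val;
int err;

err = param_val_conv_uint_get(param_val_conv, PARAM_VAL_CONV_LEN,
			      "fw_load_policy", "flash", &val);
/* on success, val == DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH */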



[PATCH iproute2-next 0/2] devlink: Add support for 'fw_load_policy' generic parameter

2018-12-04 Thread Shalom Toledo
Patch #1 adds string to uint conversion support for generic parameters.
Patch #2 adds string to uint support for the 'fw_load_policy' generic parameter.

Shalom Toledo (2):
  devlink: Add string to uint{8,16,32} conversion for generic parameters
  devlink: Add support for 'fw_load_policy' generic parameter

 devlink/devlink.c| 156 ---
 include/uapi/linux/devlink.h |   5 ++
 2 files changed, 151 insertions(+), 10 deletions(-)

-- 
2.17.2



[RFC bpf-next 3/7] ppc: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
This patch implements code-gen for BPF_ALU | BPF_ARSH | BPF_*.

Cc: Naveen N. Rao 
Cc: Sandipan Das 
Signed-off-by: Jiong Wang 
---
 arch/powerpc/include/asm/ppc-opcode.h | 2 ++
 arch/powerpc/net/bpf_jit.h| 4 
 arch/powerpc/net/bpf_jit_comp64.c | 6 ++
 3 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index a6e9e31..9014592 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -342,6 +342,8 @@
 #define PPC_INST_SLW   0x7c30
 #define PPC_INST_SLD   0x7c36
 #define PPC_INST_SRW   0x7c000430
+#define PPC_INST_SRAW  0x7c000630
+#define PPC_INST_SRAWI 0x7c000670
 #define PPC_INST_SRD   0x7c000436
 #define PPC_INST_SRAD  0x7c000634
 #define PPC_INST_SRADI 0x7c000674
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 47fc666..c2d5192 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -152,6 +152,10 @@
 ___PPC_RS(a) | ___PPC_RB(s))
 #define PPC_SRW(d, a, s)   EMIT(PPC_INST_SRW | ___PPC_RA(d) |\
 ___PPC_RS(a) | ___PPC_RB(s))
+#define PPC_SRAW(d, a, s)  EMIT(PPC_INST_SRAW | ___PPC_RA(d) |   \
+___PPC_RS(a) | ___PPC_RB(s))
+#define PPC_SRAWI(d, a, i) EMIT(PPC_INST_SRAWI | ___PPC_RA(d) |  \
+___PPC_RS(a) | __PPC_SH(i))
 #define PPC_SRD(d, a, s)   EMIT(PPC_INST_SRD | ___PPC_RA(d) |\
 ___PPC_RS(a) | ___PPC_RB(s))
 #define PPC_SRAD(d, a, s)  EMIT(PPC_INST_SRAD | ___PPC_RA(d) |   \
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 17482f5..c685b4f 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -529,9 +529,15 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
if (imm != 0)
PPC_SRDI(dst_reg, dst_reg, imm);
break;
+   case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */
+   PPC_SRAW(dst_reg, dst_reg, src_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */
PPC_SRAD(dst_reg, dst_reg, src_reg);
break;
+   case BPF_ALU | BPF_ARSH | BPF_K: /* (s32) dst >>= imm */
+   PPC_SRAWI(dst_reg, dst_reg, imm);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* (s64) dst >>= imm */
if (imm != 0)
PPC_SRADI(dst_reg, dst_reg, imm);
-- 
2.7.4



[RFC bpf-next 2/7] mips: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_X

2018-12-04 Thread Jiong Wang
Jitting of BPF_K is supported already, but not BPF_X. This patch completes
the support for the latter on both MIPS and microMIPS.

Cc: Paul Burton 
Cc: linux-m...@vger.kernel.org
Signed-off-by: Jiong Wang 
---
 arch/mips/include/asm/uasm.h  | 1 +
 arch/mips/include/uapi/asm/inst.h | 1 +
 arch/mips/mm/uasm-micromips.c | 1 +
 arch/mips/mm/uasm-mips.c  | 1 +
 arch/mips/mm/uasm.c   | 9 +
 arch/mips/net/ebpf_jit.c  | 4 
 6 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index 59dae37..b1990dd 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -157,6 +157,7 @@ Ip_u2u1s3(_slti);
 Ip_u2u1s3(_sltiu);
 Ip_u3u1u2(_sltu);
 Ip_u2u1u3(_sra);
+Ip_u3u2u1(_srav);
 Ip_u2u1u3(_srl);
 Ip_u3u2u1(_srlv);
 Ip_u3u1u2(_subu);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 273ef58..40fbb5d 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -371,6 +371,7 @@ enum mm_32a_minor_op {
mm_srl32_op = 0x040,
mm_srlv32_op = 0x050,
mm_sra_op = 0x080,
+   mm_srav_op = 0x090,
mm_rotr_op = 0x0c0,
mm_lwxs_op = 0x118,
mm_addu32_op = 0x150,
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index 24e5b0d..75ef904 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -104,6 +104,7 @@ static const struct insn insn_table_MM[insn_invalid] = {
	[insn_sltiu]= {M(mm_sltiu32_op, 0, 0, 0, 0, 0), RT | RS | SIMM},
	[insn_sltu] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_sltu_op), RT | RS | RD},
	[insn_sra]  = {M(mm_pool32a_op, 0, 0, 0, 0, mm_sra_op), RT | RS | RD},
+	[insn_srav] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srav_op), RT | RS | RD},
	[insn_srl]  = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srl32_op), RT | RS | RD},
	[insn_srlv] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srlv32_op), RT | RS | RD},
	[insn_rotr] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_rotr_op), RT | RS | RD},
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 60ceb93..6abe40f 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -171,6 +171,7 @@ static const struct insn insn_table[insn_invalid] = {
[insn_sltiu]= {M(sltiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM},
[insn_sltu] = {M(spec_op, 0, 0, 0, 0, sltu_op), RS | RT | RD},
[insn_sra]  = {M(spec_op, 0, 0, 0, 0, sra_op),  RT | RD | RE},
+   [insn_srav] = {M(spec_op, 0, 0, 0, 0, srav_op), RS | RT | RD},
[insn_srl]  = {M(spec_op, 0, 0, 0, 0, srl_op),  RT | RD | RE},
[insn_srlv] = {M(spec_op, 0, 0, 0, 0, srlv_op),  RS | RT | RD},
[insn_subu] = {M(spec_op, 0, 0, 0, 0, subu_op), RS | RT | RD},
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 57570c0..45b6264 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -61,10 +61,10 @@ enum opcode {
insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_nor,
insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb,
insn_sc, insn_scd, insn_sd, insn_sh, insn_sll, insn_sllv,
-   insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, insn_srl,
-   insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
-   insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
-   insn_xori, insn_yield,
+   insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, insn_srav,
+   insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall,
+   insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh,
+   insn_xor, insn_xori, insn_yield,
insn_invalid /* insn_invalid must be last */
 };
 
@@ -353,6 +353,7 @@ I_u2u1s3(_slti)
 I_u2u1s3(_sltiu)
 I_u3u1u2(_sltu)
 I_u2u1u3(_sra)
+I_u3u2u1(_srav)
 I_u2u1u3(_srl)
 I_u3u2u1(_srlv)
 I_u2u1u3(_rotr)
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index aeb7b1b..b16710a 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -854,6 +854,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_ALU | BPF_MOD | BPF_X: /* ALU_REG */
case BPF_ALU | BPF_LSH | BPF_X: /* ALU_REG */
case BPF_ALU | BPF_RSH | BPF_X: /* ALU_REG */
+   case BPF_ALU | BPF_ARSH | BPF_X: /* ALU_REG */
src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (src < 0 || dst < 0)
@@ -913,6 +914,9 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_RSH:
emit_instr(ctx, srlv, dst, dst, src);
break;
+   case BPF_ARSH:
+   emit_instr(ctx, srav, dst, dst, src);
+   break;
  

[RFC bpf-next 5/7] nfp: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
BPF_X support needs indirect shift mode, please see code comments for
details.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 45 
 1 file changed, 45 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 97d33bb..662cbc2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2382,6 +2382,49 @@ static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
return 0;
 }
 
+static int __ashr_imm(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
+{
+   /* Set signedness bit (MSB of result). */
+   emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR, reg_imm(0));
+   emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, reg_b(dst),
+SHF_SC_R_SHF, shift_amt);
+   wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+
+   return 0;
+}
+
+static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u64 umin, umax;
+   u8 dst, src;
+
+   dst = insn->dst_reg * 2;
+   umin = meta->umin_src;
+   umax = meta->umax_src;
+   if (umin == umax)
+   return __ashr_imm(nfp_prog, dst, umin);
+
+   src = insn->src_reg * 2;
+   /* NOTE: the first insn will set both indirect shift amount (source A)
+* and signedness bit (MSB of result).
+*/
+   emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
+   emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+  reg_b(dst), SHF_SC_R_SHF);
+   wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+
+   return 0;
+}
+
+static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u8 dst = insn->dst_reg * 2;
+
+   return __ashr_imm(nfp_prog, dst, insn->imm);
+}
+
 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
const struct bpf_insn *insn = &meta->insn;
@@ -3286,6 +3329,8 @@ static const instr_cb_t instr_cb[256] = {
[BPF_ALU | BPF_DIV | BPF_K] =   div_imm,
[BPF_ALU | BPF_NEG] =   neg_reg,
[BPF_ALU | BPF_LSH | BPF_K] =   shl_imm,
+   [BPF_ALU | BPF_ARSH | BPF_X] =  ashr_reg,
+   [BPF_ALU | BPF_ARSH | BPF_K] =  ashr_imm,
[BPF_ALU | BPF_END | BPF_X] =   end_reg32,
[BPF_LD | BPF_IMM | BPF_DW] =   imm_ld8,
[BPF_LD | BPF_ABS | BPF_B] =data_ld1,
-- 
2.7.4



[RFC bpf-next 1/7] bpf: interpreter support BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
This patch implements interpreting BPF_ALU | BPF_ARSH: do an arithmetic
right shift on the low 32-bit sub-register, and zero the high 32 bits.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 kernel/bpf/core.c | 52 ++--
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f93ed66..36e31d8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -923,32 +923,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 #define BPF_INSN_MAP(INSN_2, INSN_3)   \
/* 32 bit ALU operations. */\
/*   Register based. */ \
-   INSN_3(ALU, ADD, X),\
-   INSN_3(ALU, SUB, X),\
-   INSN_3(ALU, AND, X),\
-   INSN_3(ALU, OR,  X),\
-   INSN_3(ALU, LSH, X),\
-   INSN_3(ALU, RSH, X),\
-   INSN_3(ALU, XOR, X),\
-   INSN_3(ALU, MUL, X),\
-   INSN_3(ALU, MOV, X),\
-   INSN_3(ALU, DIV, X),\
-   INSN_3(ALU, MOD, X),\
+   INSN_3(ALU, ADD,  X),   \
+   INSN_3(ALU, SUB,  X),   \
+   INSN_3(ALU, AND,  X),   \
+   INSN_3(ALU, OR,   X),   \
+   INSN_3(ALU, LSH,  X),   \
+   INSN_3(ALU, RSH,  X),   \
+   INSN_3(ALU, XOR,  X),   \
+   INSN_3(ALU, MUL,  X),   \
+   INSN_3(ALU, MOV,  X),   \
+   INSN_3(ALU, ARSH, X),   \
+   INSN_3(ALU, DIV,  X),   \
+   INSN_3(ALU, MOD,  X),   \
INSN_2(ALU, NEG),   \
INSN_3(ALU, END, TO_BE),\
INSN_3(ALU, END, TO_LE),\
/*   Immediate based. */\
-   INSN_3(ALU, ADD, K),\
-   INSN_3(ALU, SUB, K),\
-   INSN_3(ALU, AND, K),\
-   INSN_3(ALU, OR,  K),\
-   INSN_3(ALU, LSH, K),\
-   INSN_3(ALU, RSH, K),\
-   INSN_3(ALU, XOR, K),\
-   INSN_3(ALU, MUL, K),\
-   INSN_3(ALU, MOV, K),\
-   INSN_3(ALU, DIV, K),\
-   INSN_3(ALU, MOD, K),\
+   INSN_3(ALU, ADD,  K),   \
+   INSN_3(ALU, SUB,  K),   \
+   INSN_3(ALU, AND,  K),   \
+   INSN_3(ALU, OR,   K),   \
+   INSN_3(ALU, LSH,  K),   \
+   INSN_3(ALU, RSH,  K),   \
+   INSN_3(ALU, XOR,  K),   \
+   INSN_3(ALU, MUL,  K),   \
+   INSN_3(ALU, MOV,  K),   \
+   INSN_3(ALU, ARSH, K),   \
+   INSN_3(ALU, DIV,  K),   \
+   INSN_3(ALU, MOD,  K),   \
/* 64 bit ALU operations. */\
/*   Register based. */ \
INSN_3(ALU64, ADD,  X), \
@@ -1127,6 +1129,12 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+   ALU_ARSH_X:
+   DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+   CONT;
+   ALU_ARSH_K:
+   DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+   CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
-- 
2.7.4



[RFC bpf-next 7/7] selftests: bpf: update testcases for BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
"arsh32 on imm" and "arsh32 on reg" now are accepted. Also added two new
testcases to make sure arsh32 won't be treated as arsh64 during
interpretation or JIT code-gen for which case the high bits will be moved
into low halve that the testcases could catch them.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 tools/testing/selftests/bpf/test_verifier.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 18d0b7f..e7f1bc7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -721,8 +721,18 @@ static struct bpf_test tests[] = {
BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 5),
BPF_EXIT_INSN(),
},
-   .result = REJECT,
-   .errstr = "unknown opcode c4",
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "arsh32 on imm 2",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 0x1122334485667788),
+   BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 7),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = -16069393,
},
{
"arsh32 on reg",
@@ -732,8 +742,19 @@ static struct bpf_test tests[] = {
BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
-   .result = REJECT,
-   .errstr = "unknown opcode cc",
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "arsh32 on reg 2",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 0x55667788),
+   BPF_MOV64_IMM(BPF_REG_1, 15),
+   BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 43724,
},
{
"arsh64 on imm",
-- 
2.7.4
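
As a quick sanity check of the retvals above, the following standalone C
program (not part of the patch set) models the arsh32 semantics that the
interpreter patch implements, namely an arithmetic shift of the low 32 bits
followed by zero-extension to 64 bits, and reproduces both expected values:

#include <stdint.h>
#include <stdio.h>

/* Model of BPF_ALU | BPF_ARSH; assumes the host compiler shifts signed
 * values arithmetically, as gcc and clang do. */
static uint64_t bpf_arsh32(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)((int32_t)dst >> shift);
}

int main(void)
{
	/* "arsh32 on imm 2": prints -16069393 */
	printf("%d\n", (int32_t)bpf_arsh32(0x1122334485667788ULL, 7));
	/* "arsh32 on reg 2": prints 43724 */
	printf("%d\n", (int32_t)bpf_arsh32(0x55667788ULL, 15));
	return 0;
}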



[RFC bpf-next 4/7] s390: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
This patch implements code-gen for BPF_ALU | BPF_ARSH | BPF_*.

Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Signed-off-by: Jiong Wang 
---
 arch/s390/net/bpf_jit_comp.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d7052cb..3ff758e 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -821,10 +821,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
/*
 * BPF_ARSH
 */
+   case BPF_ALU | BPF_ARSH | BPF_X: /* ((s32) dst) >>= src */
+   /* sra %dst,%dst,0(%src) */
+   EMIT4_DISP(0x8a00, dst_reg, src_reg, 0);
+   EMIT_ZERO(dst_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_X: /* ((s64) dst) >>= src */
/* srag %dst,%dst,0(%src) */
EMIT6_DISP_LH(0xeb00, 0x000a, dst_reg, dst_reg, src_reg, 0);
break;
+   case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst) >>= imm */
+   if (imm == 0)
+   break;
+   /* sra %dst,imm(%r0) */
+   EMIT4_DISP(0x8a00, dst_reg, REG_0, imm);
+   EMIT_ZERO(dst_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */
if (imm == 0)
break;
-- 
2.7.4



[RFC bpf-next 0/7] bpf: support BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
BPF_ALU | BPF_ARSH | BPF_* were rejected by commit 7891a87efc71
("bpf: arsh is not supported in 32 bit alu thus reject it"). As explained
in the commit message, this is because there was no complete support for them
in the interpreter and the various JIT compilation back-ends.

This patch set is a follow-up which completes the missing bits. It also
paves the way for running BPF programs compiled with ALU32 instructions
enabled by specifying -mattr=+alu32 to LLVM, in which case there are likely
to be more BPF_ALU | BPF_ARSH insns that would trigger the rejection code.

test_verifier.c is updated accordingly.

I have tested this patch set on x86-64 and NFP, I need help of review and
test on the arch changes (mips/ppc/s390).

Note, to avoid a merge conflict, the mips change needs to be applied on top of
commit 20b880a05f06 ("mips: bpf: fix encoding bug for mm_srlv32_op"); it
is on the mips-fixes branch at the moment.

Thanks.

Jiong Wang (7):
  bpf: interpreter support BPF_ALU | BPF_ARSH
  mips: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_X
  ppc: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  s390: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  nfp: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH
  selftests: bpf: update testcases for BPF_ALU | BPF_ARSH

 arch/mips/include/asm/uasm.h |  1 +
 arch/mips/include/uapi/asm/inst.h|  1 +
 arch/mips/mm/uasm-micromips.c|  1 +
 arch/mips/mm/uasm-mips.c |  1 +
 arch/mips/mm/uasm.c  |  9 ++---
 arch/mips/net/ebpf_jit.c |  4 +++
 arch/powerpc/include/asm/ppc-opcode.h|  2 ++
 arch/powerpc/net/bpf_jit.h   |  4 +++
 arch/powerpc/net/bpf_jit_comp64.c|  6 
 arch/s390/net/bpf_jit_comp.c | 12 +++
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 45 
 kernel/bpf/core.c| 52 
 kernel/bpf/verifier.c|  5 ---
 tools/testing/selftests/bpf/test_verifier.c  | 29 +---
 14 files changed, 137 insertions(+), 35 deletions(-)

-- 
2.7.4



[RFC bpf-next 6/7] bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
This patch removes the rejection of BPF_ALU | BPF_ARSH now that it is
supported by the interpreter and all JIT back-ends.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 kernel/bpf/verifier.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce8a1c3..8ed868e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3649,11 +3649,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
 
-   if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
-   verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
-   return -EINVAL;
-   }
-
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
-- 
2.7.4



Re: [PATCH net-next v4 0/2] Add mii_bus to ixgbe driver for dsa devs

2018-12-04 Thread Andrew Lunn
> Yes the current solution whereby we need to get a hold on the network
> device's struct device reference is not quite ideal, AFAIR, Andrew has
> had the same problem.

Yes, it is not nice, and there is a race with systemd renaming the
interfaces using its naming rules. We need a way to lookup an
interface based on a bus location, or similar, not by name.

  Andrew
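
One possible shape for such a lookup, purely as a sketch and not an
existing kernel API, would be to walk the netdev list comparing parent
devices instead of names:

/* Sketch only: find a net_device by its parent (bus) device so that
 * userspace renames do not matter. No such helper exists today. */
static struct net_device *netdev_by_parent(struct net *net,
					   struct device *parent)
{
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->dev.parent == parent) {
			dev_hold(dev);	/* caller must dev_put() */
			rcu_read_unlock();
			return dev;
		}
	}
	rcu_read_unlock();
	return NULL;
}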


Re: [RFC PATCH] net: mvpp2: fix detection of 10G SFP modules

2018-12-04 Thread Russell King - ARM Linux
On Tue, Dec 04, 2018 at 12:19:54PM +0200, Baruch Siach wrote:
> Hi Russell,
> 
> > On Thu, Nov 29, 2018 at 10:00:43PM +0000, Russell King - ARM Linux wrote:
> > On Thu, Nov 29, 2018 at 11:31:23AM -0800, Florian Fainelli wrote:
> > > On 11/29/2018 4:49 AM, Baruch Siach wrote:
> > > > The mvpp2_phylink_validate() relies on the interface field of
> > > > phylink_link_state to determine valid link modes. However, when called
> > > > from phylink_sfp_module_insert() this field is not initialized. The
> > > > default switch case then excludes 10G link modes. This allows 10G SFP
> > > > modules that are detected correctly to be configured at max rate of
> > > > 2.5G.
> > > > 
> > > > Catch the uninitialized PHY mode case, and allow 10G rates.
> > > > 
> > > > Cc: Maxime Chevallier 
> > > > Cc: Antoine Tenart 
> > > > Signed-off-by: Baruch Siach 
> > > > ---
> > > > Is that the right fix?
> > > 
> > > It would be a bit surprising that this is the right fix, you would
> > > expect validate to be called once everything has been parsed
> > > successfully from the SFP, is not that the case here? If not, can you
> > > find out what happens?
> > 
> > Two calls are made - the first with PHY_INTERFACE_MODE_NA to
> > determine what the advertising link mode may be, and then again
> > once the interface mode has been selected from the advertising mask.
> > 
> > Why?
> > 
> > Consider a 4.3Gbps fiberchannel SFP plugged into a 1G-only MAC.
> > If we did it as a single pass, we would end up passing an
> > interface mode of 2500BASEX first time around which is illogical.
> 
> So you consider this to be the right fix, right?

Yes, but there is another bug lurking here - the handling of invalid
interface modes is not correct.  Please see mvneta.c as an example -
interface modes that are not supported by the MAC (apart from the NA
mode) end up with the supported mask completely cleared.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up


Re: [PATCH net-next 1/4] indirect call wrappers: helpers to speed-up indirect calls of builtin

2018-12-04 Thread Paolo Abeni
On Mon, 2018-12-03 at 10:04 -0800, Eric Dumazet wrote:
> On 12/03/2018 03:40 AM, Paolo Abeni wrote:
> > This header defines a bunch of helpers that allow avoiding the
> > retpoline overhead when calling builtin functions via function pointers.
> > It boils down to explicitly comparing the function pointers to
> > known builtin functions and eventually invoking the latter directly.
> > 
> > The macros defined here implement the boilerplate for the above schema
> > and will be used by the next patches.
> > 
> > rfc -> v1:
> >  - use branch prediction hint, as suggested by Eric
> > 
> > Suggested-by: Eric Dumazet 
> > Signed-off-by: Paolo Abeni 
> > ---
> >  include/linux/indirect_call_wrapper.h | 77 +++
> >  1 file changed, 77 insertions(+)
> >  create mode 100644 include/linux/indirect_call_wrapper.h
> 
> This needs to be discussed more broadly, please include lkml 

Agreed. @David: please let me know if you prefer a repost or a v2 with
the expanded recipients list.

> Also please include Paul Turner in the discussion, since he was
> the inventor of this idea.

I will do.

Thanks,
Paolo
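
For context, the core of the helpers Paolo describes boils down to a
compare-and-branch macro along these lines (a simplified sketch of the
idea; the posted patch may differ in detail):

/* If the function pointer matches a known builtin, call the builtin
 * directly so the compiler emits a direct call instead of a retpolined
 * indirect call. */
#define INDIRECT_CALL_1(f, f1, ...)				\
	({							\
		likely(f == f1) ? f1(__VA_ARGS__) :		\
				  f(__VA_ARGS__);		\
	})

/* e.g.: ret = INDIRECT_CALL_1(ops->cb, builtin_cb, arg); */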



[PATCH v2 1/2] net: mvpp2: fix detection of 10G SFP modules

2018-12-04 Thread Baruch Siach
The mvpp2_phylink_validate() relies on the interface field of
phylink_link_state to determine valid link modes. However, when called
from phylink_sfp_module_insert() this field is not initialized. The
default switch case then excludes 10G link modes. This allows 10G SFP
modules that are detected correctly to be configured at max rate of
2.5G.

Catch the uninitialized PHY mode case, and allow 10G rates.

Fixes: d97c9f4ab000b ("net: mvpp2: 1000baseX support")
Cc: Maxime Chevallier 
Cc: Antoine Tenart 
Acked-by: Russell King 
Signed-off-by: Baruch Siach 
---
v2:
  Add ack and fixes tags
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 7a37a37e3fb3..eb1dc8abc359 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -4384,6 +4384,7 @@ static void mvpp2_phylink_validate(struct net_device *dev,
 
switch (state->interface) {
case PHY_INTERFACE_MODE_10GKR:
+   case PHY_INTERFACE_MODE_NA:
phylink_set(mask, 10000baseCR_Full);
phylink_set(mask, 10000baseSR_Full);
phylink_set(mask, 10000baseLR_Full);
-- 
2.19.2



[PATCH] i40e: fix mac filter delete when setting mac address

2018-12-04 Thread Stefan Assmann
A previous commit moved the ether_addr_copy() in i40e_set_mac() before
the mac filter del/add to avoid a race. However it wasn't taken into
account that this alters the mac address being handed to
i40e_del_mac_filter().

Also changed i40e_add_mac_filter() to operate on netdev->dev_addr,
hopefully that makes the code easier to read.

Fixes: 458867b2ca0c ("i40e: don't remove netdev->dev_addr when syncing uc list")

Signed-off-by: Stefan Assmann 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 6d5b13f69dec..901ef799d2ad 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1546,17 +1546,17 @@ static int i40e_set_mac(struct net_device *netdev, void *p)
netdev_info(netdev, "set new mac address %pM\n", addr->sa_data);
 
/* Copy the address first, so that we avoid a possible race with
-* .set_rx_mode(). If we copy after changing the address in the filter
-* list, we might open ourselves to a narrow race window where
-* .set_rx_mode could delete our dev_addr filter and prevent traffic
-* from passing.
+* .set_rx_mode().
+* - Remove old address from MAC filter
+* - Copy new address
+* - Add new address to MAC filter
 */
-   ether_addr_copy(netdev->dev_addr, addr->sa_data);
-
spin_lock_bh(&vsi->mac_filter_hash_lock);
i40e_del_mac_filter(vsi, netdev->dev_addr);
-   i40e_add_mac_filter(vsi, addr->sa_data);
+   ether_addr_copy(netdev->dev_addr, addr->sa_data);
+   i40e_add_mac_filter(vsi, netdev->dev_addr);
spin_unlock_bh(&vsi->mac_filter_hash_lock);
+
if (vsi->type == I40E_VSI_MAIN) {
i40e_status ret;
 
-- 
2.19.2



RE: [PATCH net-next] can: flexcan: flexcan_chip_start(): fix the error return code in flexcan_setup_stop_mode()

2018-12-04 Thread Aisheng DONG
> -Original Message-
> From: Wei Yongjun [mailto:weiyongj...@huawei.com]
> Sent: Tuesday, December 4, 2018 2:26 PM
> To: Wolfgang Grandegger ; Marc Kleine-Budde
> ; Aisheng DONG 
> Cc: Wei Yongjun ; linux-...@vger.kernel.org;
> netdev@vger.kernel.org; kernel-janit...@vger.kernel.org
> Subject: [PATCH net-next] can: flexcan: flexcan_chip_start(): fix the error return code in flexcan_setup_stop_mode()
> 
> The error return code PTR_ERR(gpr_np) is always 0 since gpr_np is equal to
> NULL in this error handling case. Fix it by return -ENOENT.
> 
> Fixes: de3578c198c6 ("can: flexcan: add self wakeup support")
> Signed-off-by: Wei Yongjun 
> ---
>  drivers/net/can/flexcan.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index
> 0f36eaf..f412d84 100644
> --- a/drivers/net/can/flexcan.c
> +++ b/drivers/net/can/flexcan.c
> @@ -1432,7 +1432,7 @@ static int flexcan_setup_stop_mode(struct
> platform_device *pdev)
>   gpr_np = of_find_node_by_phandle(phandle);
>   if (!gpr_np) {
>   dev_dbg(&pdev->dev, "could not find gpr node by phandle\n");
> - return PTR_ERR(gpr_np);
> + return -ENOENT;

Good catch.

Reviewed-by: Dong Aisheng 

Regards
Dong Aisheng

>   }
> 
>   priv = netdev_priv(dev);
> 
> 
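For illustration of the bug class fixed here: the kernel's PTR_ERR() is just a cast, so calling it on a NULL pointer yields 0, i.e. "success". A minimal userspace sketch of the same idiom (PTR_ERR() reimplemented locally for the demo):

#include <stdio.h>

/* userspace stand-in for the kernel's PTR_ERR(): just a cast */
static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

int main(void)
{
	void *gpr_np = NULL;	/* of_find_node_by_phandle() failed */

	/* prints 0 -- a caller checking for a negative errno would
	 * wrongly treat this as success */
	printf("PTR_ERR(gpr_np) = %ld\n", PTR_ERR(gpr_np));
	return 0;
}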



Re: [PATCH 2/2] net: phy: ensure autoneg is configured when resuming a phydev

2018-12-04 Thread Anssi Hannula
On 3.12.2018 23:41, Heiner Kallweit wrote:
> On 03.12.2018 11:43, Anssi Hannula wrote:
>> On 1.12.2018 0:16, Heiner Kallweit wrote:
>>> On 30.11.2018 19:45, Anssi Hannula wrote:
 When a PHY_HALTED phydev is resumed by phy_start(), it is set to
 PHY_RESUMING to wait for the link to come up.

 However, if the phydev was put to PHY_HALTED (by e.g. phy_stop()) before
 autonegotiation was ever started by phy_state_machine(), autonegotiation
 remains unconfigured, i.e. phy_config_aneg() is never called.

>>> phy_stop() should only be called once the PHY was started. But it's
>>> right that we don't have full control over it because misbehaving
>>> drivers may call phy_stop() even if the PHY hasn't been started yet.
>> Having run phy_start() does not guarantee that the phy_state_machine()
>> has been run at least once, though.
>>
>> I was able to reproduce the issue by calling phy_start(); phy_stop().
>> Then on the next phy_start() autoneg is not configured.
>>
> Right, phy_start() schedules the state machine run, so there is a small
> window where this can happen. Did you experience this in any real-life
> scenario?

No, I encountered this while tinkering with phy_start/stop() to see how
they interact with suspending phydevs, as I wanted to suspend unused ones.

So I don't think there is any rush to fix this one.

>>> I think phy_error() and phy_stop() should be extended to set the state
>>> to PHY_HALTED only if the PHY is in a started state, means:
>>> don't touch the state if it's DOWN, READY, or HALTED.
>>> In case of phy_error() it's more a precaution, because it's used within
>>> phylib only and AFAICS it can be triggered from a started state only.
>> This wouldn't fix the issue that occurs when phy_stop() is called right
>> after phy_start(), though, as PHY is in UP state then.
>>
> After having removed state AN I was thinking already whether we really
> need to have separate states READY and HALTED. I think we wouldn't
> have this scenario if phy_stop() and phy_error() would set the state
> to READY. Merging the two states requires some upfront work, but I have
> some ideas in the back of my mind.
> Overall this should be cleaner than the proposed workaround.

Yeah, sounds better indeed.

>>> So yes, there is a theoretical issue. But as you wrote already, I think
>>> there's a better way to deal with it.
>>>
>>> For checking whether PHY is in a started state you could use the helper
>>> which is being discussed here:
>>> https://marc.info/?l=linux-netdev=154360368104946=2
>>>
>>>
 Fix that by going to PHY_UP instead of PHY_RESUMING if autonegotiation
 has never been configured.

 Signed-off-by: Anssi Hannula 
 ---

 This doesn't feel as clean is I'd like to, though. Maybe there is a
 better way?


  drivers/net/phy/phy.c | 11 ++-
  include/linux/phy.h   |  2 ++
  2 files changed, 12 insertions(+), 1 deletion(-)

 diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
 index d46858694db9..7a650cce7599 100644
 --- a/drivers/net/phy/phy.c
 +++ b/drivers/net/phy/phy.c
 @@ -553,6 +553,8 @@ int phy_start_aneg(struct phy_device *phydev)
if (err < 0)
goto out_unlock;
  
 +  phydev->autoneg_configured = 1;
 +
if (phydev->state != PHY_HALTED) {
if (AUTONEG_ENABLE == phydev->autoneg) {
err = phy_check_link_status(phydev);
 @@ -893,7 +895,14 @@ void phy_start(struct phy_device *phydev)
break;
}
  
 -  phydev->state = PHY_RESUMING;
 +  /* if autoneg/forcing was never configured, go back to PHY_UP
 +   * to make that happen
 +   */
 +  if (!phydev->autoneg_configured)
 +  phydev->state = PHY_UP;
 +  else
 +  phydev->state = PHY_RESUMING;
 +
break;
default:
break;
 diff --git a/include/linux/phy.h b/include/linux/phy.h
 index 8f927246acdb..b306d5ed9d09 100644
 --- a/include/linux/phy.h
 +++ b/include/linux/phy.h
 @@ -389,6 +389,8 @@ struct phy_device {
unsigned loopback_enabled:1;
  
unsigned autoneg:1;
 +  /* autoneg has been configured at least once */
 +  unsigned autoneg_configured:1;
/* The most recently read link state */
unsigned link:1;
  


-- 
Anssi Hannula / Bitwise Oy



[PATCH v2 2/2] net: mvpp2: fix phylink handling of invalid PHY modes

2018-12-04 Thread Baruch Siach
The .validate phylink callback should empty the supported bitmap when
the interface mode is invalid.

Cc: Maxime Chevallier 
Cc: Antoine Tenart 
Reported-by: Russell King 
Signed-off-by: Baruch Siach 
---
v2:
  New patch in this series
---
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 33 ++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index eb1dc8abc359..125ea99418df 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -4375,8 +4375,27 @@ static void mvpp2_phylink_validate(struct net_device *dev,
   unsigned long *supported,
   struct phylink_link_state *state)
 {
+   struct mvpp2_port *port = netdev_priv(dev);
__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
+   /* Invalid combinations */
+   switch (state->interface) {
+   case PHY_INTERFACE_MODE_10GKR:
+   case PHY_INTERFACE_MODE_XAUI:
+   if (port->gop_id != 0)
+   goto empty_set;
+   break;
+   case PHY_INTERFACE_MODE_RGMII:
+   case PHY_INTERFACE_MODE_RGMII_ID:
+   case PHY_INTERFACE_MODE_RGMII_RXID:
+   case PHY_INTERFACE_MODE_RGMII_TXID:
+   if (port->gop_id == 0)
+   goto empty_set;
+   break;
+   default:
+   break;
+   }
+
phylink_set(mask, Autoneg);
phylink_set_port_modes(mask);
phylink_set(mask, Pause);
@@ -4384,6 +4403,7 @@ static void mvpp2_phylink_validate(struct net_device *dev,
 
switch (state->interface) {
case PHY_INTERFACE_MODE_10GKR:
+   case PHY_INTERFACE_MODE_XAUI:
case PHY_INTERFACE_MODE_NA:
phylink_set(mask, 10000baseCR_Full);
phylink_set(mask, 10000baseSR_Full);
@@ -4392,7 +4412,11 @@ static void mvpp2_phylink_validate(struct net_device *dev,
phylink_set(mask, 10000baseER_Full);
phylink_set(mask, 10000baseKR_Full);
/* Fall-through */
-   default:
+   case PHY_INTERFACE_MODE_RGMII:
+   case PHY_INTERFACE_MODE_RGMII_ID:
+   case PHY_INTERFACE_MODE_RGMII_RXID:
+   case PHY_INTERFACE_MODE_RGMII_TXID:
+   case PHY_INTERFACE_MODE_SGMII:
phylink_set(mask, 10baseT_Half);
phylink_set(mask, 10baseT_Full);
phylink_set(mask, 100baseT_Half);
@@ -4404,11 +4428,18 @@ static void mvpp2_phylink_validate(struct net_device *dev,
phylink_set(mask, 1000baseT_Full);
phylink_set(mask, 1000baseX_Full);
phylink_set(mask, 2500baseX_Full);
+   break;
+   default:
+   goto empty_set;
}
 
bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
bitmap_and(state->advertising, state->advertising, mask,
   __ETHTOOL_LINK_MODE_MASK_NBITS);
+   return;
+
+empty_set:
+   bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
 static void mvpp22_xlg_link_state(struct mvpp2_port *port,
-- 
2.19.2



[PATCH net-next 3/3] net: bridge: increase multicast's default maximum number of entries

2018-12-04 Thread Nikolay Aleksandrov
The bridge's default hash_max was 512, which is rather conservative. Now
that we're using the generic rhashtable API, which autoshrinks, let's
increase it to 4096 and move it to a define in br_private.h.

Signed-off-by: Nikolay Aleksandrov 
---
 net/bridge/br_multicast.c | 2 +-
 net/bridge/br_private.h   | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 79f981a4c225..5871086a0ab1 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1743,7 +1743,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
 
 void br_multicast_init(struct net_bridge *br)
 {
-   br->hash_max = 512;
+   br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
 
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
br->multicast_last_member_count = 2;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e3c56b418543..a469a779478e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -31,6 +31,8 @@
#define BR_PORT_BITS   10
#define BR_MAX_PORTS   (1<<BR_PORT_BITS)
 
+#define BR_MULTICAST_DEFAULT_HASH_MAX 4096

[PATCH net-next 0/3] net: bridge: convert multicast to generic rhashtable

2018-12-04 Thread Nikolay Aleksandrov
Hi,
The current bridge multicast code uses a custom rhashtable
implementation which predates the generic rhashtable API. Patch 01
converts it to use the generic kernel rhashtable which simplifies the
code a lot and removes duplicated functionality. The convert also makes
hash_elasticity obsolete as the generic rhashtable already has such
checks and has a fixed elasticity of RHT_ELASTICITY (16 currently) so we
emit a warning whenever elasticity is set and return RHT_ELASTICITY when
read (patch 02). Since now we have the generic rhashtable which
autoshrinks we can be more liberal with the default hash maximum so patch
03 increases it to 4096 and moves it to a define in br_private.h.

Thanks,
 Nik


Nikolay Aleksandrov (3):
  net: bridge: convert multicast to generic rhashtable
  net: bridge: mark hash_elasticity as obsolete
  net: bridge: increase multicast's default maximum number of entries

 net/bridge/br_device.c|  11 ++
 net/bridge/br_mdb.c   | 120 +--
 net/bridge/br_multicast.c | 405 ++
 net/bridge/br_netlink.c   |  11 +-
 net/bridge/br_private.h   |  36 ++--
 net/bridge/br_sysfs_br.c  |   6 +-
 6 files changed, 149 insertions(+), 440 deletions(-)

-- 
2.17.2



[PATCH net-next 1/3] net: bridge: convert multicast to generic rhashtable

2018-12-04 Thread Nikolay Aleksandrov
The bridge multicast code currently uses a custom resizable hashtable
which predates the generic rhashtable interface. It has many
shortcomings compared to the generic rhashtable and duplicates
functionality that is presently available there, so this patch removes
the custom hashtable implementation in favor of the kernel's generic
rhashtable. The hash maximum is kept, and the rhashtable's size is used
to do a loose check whether it's been reached, in which case we revert
to the old behaviour and disable further bridge multicast processing.
Also, we can now support any hash maximum; it doesn't need to be a
power of 2.

Signed-off-by: Nikolay Aleksandrov 
---
 net/bridge/br_device.c|  11 ++
 net/bridge/br_mdb.c   | 120 +---
 net/bridge/br_multicast.c | 402 ++
 net/bridge/br_private.h   |  33 ++--
 4 files changed, 139 insertions(+), 427 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c6abf927f0c9..1cb09aaaf193 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev)
return err;
}
 
+   err = br_mdb_hash_init(br);
+   if (err) {
+   free_percpu(br->stats);
+   br_fdb_hash_fini(br);
+   return err;
+   }
+
err = br_vlan_init(br);
if (err) {
free_percpu(br->stats);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
return err;
}
@@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev)
if (err) {
free_percpu(br->stats);
br_vlan_flush(br);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
}
br_set_lockdep_class(dev);
@@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev)
br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
free_percpu(br->stats);
 }
@@ -426,6 +436,7 @@ void br_dev_setup(struct net_device *dev)
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
INIT_HLIST_HEAD(&br->fdb_list);
+   INIT_HLIST_HEAD(&br->mdb_list);
spin_lock_init(&br->hash_lock);
 
br->bridge_id.prio[0] = 0x80;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a7ea2d431714..ea8abdb56df3 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
 {
+   int idx = 0, s_idx = cb->args[1], err = 0;
struct net_bridge *br = netdev_priv(dev);
-   struct net_bridge_mdb_htable *mdb;
+   struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
-   int i, err = 0;
-   int idx = 0, s_idx = cb->args[1];
 
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
 
-   mdb = rcu_dereference(br->mdb);
-   if (!mdb)
-   return 0;
-
nest = nla_nest_start(skb, MDBA_MDB);
if (nest == NULL)
return -EMSGSIZE;
 
-   for (i = 0; i < mdb->max; i++) {
-   struct net_bridge_mdb_entry *mp;
+   hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
struct net_bridge_port *port;
 
-   hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
-   if (idx < s_idx)
-   goto skip;
+   if (idx < s_idx)
+   goto skip;
 
-   nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
-   if (nest2 == NULL) {
-   err = -EMSGSIZE;
-   goto out;
-   }
+   nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+   if (!nest2) {
+   err = -EMSGSIZE;
+   break;
+   }
 
-   for (pp = &mp->ports;
-(p = rcu_dereference(*pp)) != NULL;
- pp = &p->next) {
-   struct nlattr *nest_ent;
-   struct br_mdb_entry e;
-
-   port = p->port;
-   if (!port)
-   continue;
-
-   memset(&e, 0, sizeof(e));
-   e.ifindex = port->dev->ifindex;
-   e.vid = p->addr.vid;
-   __mdb_entry_fill_flags(&e, p->flags);
-   if (p->addr.proto == htons(ETH_P_IP))
-  
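Since the posted diff is truncated above, here is a rough sketch of what the conversion boils down to on the setup side; the field and parameter names are assumed for illustration, with automatic_shrinking matching the autoshrink behaviour mentioned in the cover letter:

static const struct rhashtable_params br_mdb_rht_params = {
	/* assumed names: rhnode/addr members of net_bridge_mdb_entry */
	.head_offset = offsetof(struct net_bridge_mdb_entry, rhnode),
	.key_offset = offsetof(struct net_bridge_mdb_entry, addr),
	.key_len = sizeof(struct br_ip),
	.automatic_shrinking = true,
};

static int br_mdb_hash_init(struct net_bridge *br)
{
	return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
}

static void br_mdb_hash_fini(struct net_bridge *br)
{
	rhashtable_destroy(&br->mdb_hash_tbl);
}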

[PATCH net-next 2/3] net: bridge: mark hash_elasticity as obsolete

2018-12-04 Thread Nikolay Aleksandrov
Now that the bridge multicast uses the generic rhashtable interface we
can drop the hash_elasticity option as that is already done for us and
it's hardcoded to a maximum of RHT_ELASTICITY (16 currently). Add a
warning about the obsolete option when the hash_elasticity is set.

Signed-off-by: Nikolay Aleksandrov 
---
 net/bridge/br_multicast.c |  1 -
 net/bridge/br_netlink.c   | 11 ---
 net/bridge/br_private.h   |  1 -
 net/bridge/br_sysfs_br.c  |  6 +++---
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index fdca91231815..79f981a4c225 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1743,7 +1743,6 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
 
 void br_multicast_init(struct net_bridge *br)
 {
-   br->hash_elasticity = 4;
br->hash_max = 512;
 
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 13cd50326af2..d3a5963de35d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1189,11 +1189,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
return err;
}
 
-   if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) {
-   u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]);
-
-   br->hash_elasticity = val;
-   }
+   if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
+   br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+   RHT_ELASTICITY);
 
if (data[IFLA_BR_MCAST_HASH_MAX]) {
u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
@@ -1457,8 +1455,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
   br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
   br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
-   nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
-   br->hash_elasticity) ||
+   nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index ff443aea279f..e3c56b418543 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -372,7 +372,6 @@ struct net_bridge {
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 
-   u32 hash_elasticity;
u32 hash_max;
 
u32 multicast_last_member_count;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 6a378a7e16ea..05a910d3b70a 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -424,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier);
 static ssize_t hash_elasticity_show(struct device *d,
struct device_attribute *attr, char *buf)
 {
-   struct net_bridge *br = to_bridge(d);
-   return sprintf(buf, "%u\n", br->hash_elasticity);
+   return sprintf(buf, "%u\n", RHT_ELASTICITY);
 }
 
 static int set_elasticity(struct net_bridge *br, unsigned long val)
 {
-   br->hash_elasticity = val;
+   br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+   RHT_ELASTICITY);
return 0;
 }
 
-- 
2.17.2



RE: [PATCH net-next] can: flexcan: flexcan_chip_start(): fix the error return code in flexcan_setup_stop_mode()

2018-12-04 Thread Aisheng DONG
Copy Joakim.

> > From: Wei Yongjun [mailto:weiyongj...@huawei.com]
[...]
> > Subject: [PATCH net-next] can: flexcan: flexcan_chip_start(): fix the error return code in flexcan_setup_stop_mode()
> >
> > The error return code PTR_ERR(gpr_np) is always 0 since gpr_np is
> > equal to NULL in this error handling case. Fix it by return -ENOENT.
> >
> > Fixes: de3578c198c6 ("can: flexcan: add self wakeup support")
> > Signed-off-by: Wei Yongjun 
> > ---
> >  drivers/net/can/flexcan.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
> > index
> > 0f36eaf..f412d84 100644
> > --- a/drivers/net/can/flexcan.c
> > +++ b/drivers/net/can/flexcan.c
> > @@ -1432,7 +1432,7 @@ static int flexcan_setup_stop_mode(struct
> > platform_device *pdev)
> > gpr_np = of_find_node_by_phandle(phandle);
> > if (!gpr_np) {
> > dev_dbg(&pdev->dev, "could not find gpr node by phandle\n");
> > -   return PTR_ERR(gpr_np);
> > +   return -ENOENT;
> 
> Good catch.
> 
> Reviewed-by: Dong Aisheng 
> 
> Regards
> Dong Aisheng
> 
> > }
> >
> > priv = netdev_priv(dev);
> >
> >



Re: [PATCH net-next 0/3] net: bridge: convert multicast to generic rhashtable

2018-12-04 Thread Nikolay Aleksandrov
On 04/12/2018 16:44, Nikolay Aleksandrov wrote:
> Hi,
> The current bridge multicast code uses a custom rhashtable
> implementation which predates the generic rhashtable API. Patch 01
> converts it to use the generic kernel rhashtable which simplifies the
> code a lot and removes duplicated functionality. The convert also makes
> hash_elasticity obsolete as the generic rhashtable already has such
> checks and has a fixed elasticity of RHT_ELASTICITY (16 currently) so we
> emit a warning whenever elasticity is set and return RHT_ELASTICITY when
> read (patch 02). Since now we have the generic rhashtable which
> autoshrinks we can be more liberal with the default hash maximum so patch
> 03 increases it to 4096 and moves it to a define in br_private.h.
> 
> Thanks,
>  Nik
> 
> 
> Nikolay Aleksandrov (3):
>   net: bridge: convert multicast to generic rhashtable
>   net: bridge: mark hash_elasticity as obsolete
>   net: bridge: increase multicast's default maximum number of entries
> 
>  net/bridge/br_device.c|  11 ++
>  net/bridge/br_mdb.c   | 120 +--
>  net/bridge/br_multicast.c | 405 ++
>  net/bridge/br_netlink.c   |  11 +-
>  net/bridge/br_private.h   |  36 ++--
>  net/bridge/br_sysfs_br.c  |   6 +-
>  6 files changed, 149 insertions(+), 440 deletions(-)
> 

Self-NAK, sent an older version of the patch-set which has a problem
when IGMP_SNOOPING is not defined. Will send the current in a few.


Re: [PATCH v5 0/4] Fix unsafe BPF_PROG_TEST_RUN interface

2018-12-04 Thread Alexei Starovoitov
On Mon, Dec 03, 2018 at 11:31:22AM +, Lorenz Bauer wrote:
> Right now, there is no safe way to use BPF_PROG_TEST_RUN with data_out.
> This is because bpf_test_finish copies the output buffer to user space
> without checking its size. This can lead to the kernel overwriting
> data in user space after the buffer if xdp_adjust_head and friends are
> in play.
> 
> Thanks to everyone for their advice and patience with this patch set!
> 
> Changes in v5:
> * Fix up libbpf.map
> 
> Changes in v4:
> * Document bpf_prog_test_run and bpf_prog_test_run_xattr
> * Use struct bpf_prog_test_run_attr for return values
> 
> Changes in v3:
> * Introduce bpf_prog_test_run_xattr instead of modifying the existing
>   function
> 
> Changes in v2:
> * Make the syscall return ENOSPC if data_size_out is too small
> * Make bpf_prog_test_run return EINVAL if size_out is missing
> * Document the new behaviour of data_size_out

Applied, Thanks
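For reference, a minimal usage sketch of the new libbpf entry point added by this series; prog_fd and the packet buffers below are placeholders, and the ENOSPC behaviour follows the changelog:

	char pkt_in[64] = { 0 };	/* test packet */
	char pkt_out[64];
	struct bpf_prog_test_run_attr tattr = {
		.prog_fd	= prog_fd,
		.repeat		= 1,
		.data_in	= pkt_in,
		.data_size_in	= sizeof(pkt_in),
		.data_out	= pkt_out,
		/* in: capacity of pkt_out; out: bytes the program produced */
		.data_size_out	= sizeof(pkt_out),
	};
	int err = bpf_prog_test_run_xattr(&tattr);

	/* If the program grew the packet past sizeof(pkt_out), e.g. via
	 * bpf_xdp_adjust_head(), the kernel now fails with ENOSPC instead
	 * of writing past the buffer. */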



Re: [PATCH net-next 0/4] mlxsw: Add one-armed router support

2018-12-04 Thread David Miller
From: Ido Schimmel 
Date: Tue, 4 Dec 2018 08:15:09 +

> Up until now, when a packet was routed by the ASIC through the same
> router interface (RIF) from which it ingressed from, the ASIC passed the
> sole copy of the packet to the kernel. This allowed the kernel to route
> the packet and also potentially generate an ICMP redirect.
> 
> There are scenarios (e.g., "one-armed router") where packets are
> intentionally routed this way and are therefore not deemed as
> exceptions. In such scenarios the current method of trapping packets to
> the CPU is problematic, as it results in major packet loss.
> 
> This patchset solves the problem by having the ASIC forward the packet,
> but also send a copy to the CPU, which gives the kernel the opportunity
> to generate required exceptions.
> 
> To prevent the kernel from forwarding such packets again, the driver
> marks them with 'offload_l3_fwd_mark', which causes the kernel to
> consume them in ip{,6}_forward_finish().
> 
> Patch #1 renames 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark'. When
> set, the field indicates that a packet was already forwarded in L3
> (unicast / multicast) by a capable device.
> 
> Patch #2 teaches the kernel to consume unicast packets that have
> 'offload_l3_fwd_mark' set.
> 
> Patch #3 changes mlxsw to mirror loopbacked (iRIF == eRIF) packets,
> instead of trapping them.
> 
> Patch #4 adds a test case for above mentioned scenario.

Series applied, thank you.
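For reference, the consume step described for patch #2 boils down to something like the following in ip_forward_finish(); this is a sketch based on the cover letter, not the exact hunk:

#ifdef CONFIG_NET_SWITCHDEV
	/* The device already forwarded this packet in L3; the kernel only
	 * saw a copy so it could generate exceptions (e.g. ICMP redirects),
	 * so consume it instead of forwarding it a second time. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif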


RE: [PATCH] intel: ice: Do not enable NAPI on q_vectors that have no rings

2018-12-04 Thread Bowers, AndrewX
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Yang Xiao
> Sent: Wednesday, November 28, 2018 5:54 PM
> To: Kirsher, Jeffrey T ; da...@davemloft.net
> Cc: netdev@vger.kernel.org; intel-wired-...@lists.osuosl.org; Yang Xiao
> 
> Subject: [Intel-wired-lan] [PATCH] intel: ice: Do not enable NAPI on
> q_vectors that have no rings
> 
> From: Young Xiao 
> 
> If ice driver has q_vectors w/ active NAPI that has no rings, then this will
> result in a divide by zero error. To correct it I am updating the driver code so
> that we only support NAPI on q_vectors that have 1 or more rings allocated
> to them.
> 
> See commit 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have
> no rings") for detail.
> 
> Signed-off-by: Young Xiao 
> ---
>  drivers/net/ethernet/intel/ice/ice_main.c | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)

Tested-by: Andrew Bowers 




Re: [PATCH net-next 1/4] indirect call wrappers: helpers to speed-up indirect calls of builtin

2018-12-04 Thread Edward Cree
On 03/12/18 11:40, Paolo Abeni wrote:
> This header define a bunch of helpers that allow avoiding the
> retpoline overhead when calling builtin functions via function pointers.
> It boils down to explicitly comparing the function pointers to
> known builtin functions and eventually invoke directly the latter.
>
> The macros defined here implement the boilerplate for the above schema
> and will be used by the next patches.
>
> rfc -> v1:
>  - use branch prediction hint, as suggested by Eric
>
> Suggested-by: Eric Dumazet 
> Signed-off-by: Paolo Abeni 
> ---
I'm not sure I see the reason why this is done with numbers and
 'name ## NR', adding extra distance between the callsite and the
 list of callees.  In particular it means that each callable needs
 to specify its index.
Wouldn't it be simpler just to have
    #define INDIRECT_CALL_1(f, f1, ...) \
    (likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
    #define INDIRECT_CALL_2(f, f2, f1, ...) \
    (likely(f == f2) ? f2(__VA_ARGS__) : INDIRECT_CALL_1(f, f1, __VA_ARGS__))
etc.?  Removing the need for INDIRECT_CALLABLE_DECLARE_* entirely.

At least the commit message should explain the rationale for not
 doing things this way.

-Ed

PS: this has reminded me of my desire to try runtime creation of
 these kinds of branch tables with self-modifying code; is there
 any documentation on how to go about writing to kernel .text at
 runtime?  Last time I had a try at it I got very confused.


RE: [PATCH net 1/2] net/mlx4_en: Change min MTU size to ETH_MIN_MTU

2018-12-04 Thread David Laight
From: Eric Dumazet
> Sent: 04 December 2018 17:04
> 
> On 12/04/2018 08:59 AM, David Laight wrote:
> > From: Tariq Toukan
> >> Sent: 02 December 2018 12:35
> >> From: Eran Ben Elisha 
> >>
> >> NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in
> >> the RFC791 and in the network stack. Remove old mlx4_en only define for
> >> it, which was set to wrong value.
> > ...
> >>
> >> -  /* MTU range: 46 - hw-specific max */
> >> -  dev->min_mtu = MLX4_EN_MIN_MTU;
> >> +  /* MTU range: 68 - hw-specific max */
> >> +  dev->min_mtu = ETH_MIN_MTU;
> >>dev->max_mtu = priv->max_mtu;
> >
> > Where does 68 come from?
> 
> Min IPv4 MTU per RFC791

Which has nothing to do with an ethernet driver.
Indeed, IIRC, it is the smallest maximum frame size that IPv4
can work over.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)


[PATCH v3 2/2] i40e: DRY rx_ptype handling code

2018-12-04 Thread Michał Mirosław
Move rx_ptype extracting to i40e_process_skb_fields() to avoid
duplicating the code.

Signed-off-by: Michał Mirosław 
Signed-off-by: Michał Mirosław 
---
v3:
 * no changes
v2:
 * fix prototype in i40e_txrx_common.h
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 12 
 drivers/net/ethernet/intel/i40e/i40e_txrx_common.h |  3 +--
 drivers/net/ethernet/intel/i40e/i40e_xsk.c |  6 +-
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index bc6a873ec574..d0a95424ce58 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1775,8 +1775,7 @@ static inline void i40e_rx_hash(struct i40e_ring *ring,
  * other fields within the skb.
  **/
 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
-union i40e_rx_desc *rx_desc, struct sk_buff *skb,
-u8 rx_ptype)
+union i40e_rx_desc *rx_desc, struct sk_buff *skb)
 {
u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
@@ -1784,6 +1783,8 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring,
u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK;
u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;
+   u8 rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
+ I40E_RXD_QW1_PTYPE_SHIFT;
 
if (unlikely(tsynvalid))
i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn);
@@ -2339,7 +2340,6 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
struct i40e_rx_buffer *rx_buffer;
union i40e_rx_desc *rx_desc;
unsigned int size;
-   u8 rx_ptype;
u64 qword;
 
/* return some buffers to hardware, one at a time is too slow */
@@ -2432,12 +2432,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
 
-   qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-   rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
-  I40E_RXD_QW1_PTYPE_SHIFT;
-
/* populate checksum, VLAN, and protocol */
-   i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
+   i40e_process_skb_fields(rx_ring, rx_desc, skb);
 
i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
napi_gro_receive(&rx_ring->q_vector->napi, skb);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 2c077f1c4eca..8af0e99c6c0d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -12,8 +12,7 @@ struct i40e_rx_buffer *i40e_clean_programming_status(
union i40e_rx_desc *rx_desc,
u64 qw);
 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
-union i40e_rx_desc *rx_desc, struct sk_buff *skb,
-u8 rx_ptype);
+union i40e_rx_desc *rx_desc, struct sk_buff *skb);
 void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
 void i40e_update_rx_stats(struct i40e_ring *rx_ring,
  unsigned int total_rx_bytes,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index a6ea2b249471..870cf654e436 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -634,7 +634,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
struct i40e_rx_buffer *bi;
union i40e_rx_desc *rx_desc;
unsigned int size;
-   u8 rx_ptype;
u64 qword;
 
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
@@ -712,10 +711,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
total_rx_bytes += skb->len;
total_rx_packets++;
 
-   qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-   rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
-  I40E_RXD_QW1_PTYPE_SHIFT;
-   i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
+   i40e_process_skb_fields(rx_ring, rx_desc, skb);
napi_gro_receive(&rx_ring->q_vector->napi, skb);
}
 
-- 
2.19.2



[PATCH v3 1/2] i40e: fix VLAN.TCI == 0 RX HW offload

2018-12-04 Thread Michał Mirosław
This fixes two bugs in hardware VLAN offload:
 1. VLAN.TCI == 0 was being dropped
 2. there was a race between disabling the VLAN RX feature in hardware
and processing the RX queue, where packets processed in this window
could have their VLAN information dropped

The fix moves the VLAN handling into i40e_process_skb_fields() to save
on duplicated code. i40e_receive_skb() becomes trivial and so is removed.

Signed-off-by: Michał Mirosław 
Signed-off-by: Michał Mirosław 
---
v3:
 * fix whitespace for checkpatch
v2:
 * no changes
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 31 +--
 .../ethernet/intel/i40e/i40e_txrx_common.h|  2 --
 drivers/net/ethernet/intel/i40e/i40e_xsk.c|  6 +---
 3 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index aef3c89ee79c..bc6a873ec574 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1558,24 +1558,6 @@ static bool i40e_alloc_mapped_page(struct i40e_ring 
*rx_ring,
return true;
 }
 
-/**
- * i40e_receive_skb - Send a completed packet up the stack
- * @rx_ring:  rx ring in play
- * @skb: packet to send up
- * @vlan_tag: vlan tag for packet
- **/
-void i40e_receive_skb(struct i40e_ring *rx_ring,
- struct sk_buff *skb, u16 vlan_tag)
-{
-   struct i40e_q_vector *q_vector = rx_ring->q_vector;
-
-   if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
-   (vlan_tag & VLAN_VID_MASK))
-   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
-
-   napi_gro_receive(&q_vector->napi, skb);
-}
-
 /**
  * i40e_alloc_rx_buffers - Replace used receive buffers
  * @rx_ring: ring to place buffers on
@@ -1812,6 +1794,13 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring,
 
skb_record_rx_queue(skb, rx_ring->queue_index);
 
+   if (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) {
+   u16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1;
+
+   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+  le16_to_cpu(vlan_tag));
+   }
+
/* modifies the skb - consumes the enet header */
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 }
@@ -2350,7 +2339,6 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
struct i40e_rx_buffer *rx_buffer;
union i40e_rx_desc *rx_desc;
unsigned int size;
-   u16 vlan_tag;
u8 rx_ptype;
u64 qword;
 
@@ -2451,11 +2439,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
/* populate checksum, VLAN, and protocol */
i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
 
-   vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
-  le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
-
i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
-   i40e_receive_skb(rx_ring, skb, vlan_tag);
+   napi_gro_receive(&rx_ring->q_vector->napi, skb);
skb = NULL;
 
/* update budget accounting */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 09809dffe399..2c077f1c4eca 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -14,8 +14,6 @@ struct i40e_rx_buffer *i40e_clean_programming_status(
 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
 union i40e_rx_desc *rx_desc, struct sk_buff *skb,
 u8 rx_ptype);
-void i40e_receive_skb(struct i40e_ring *rx_ring,
- struct sk_buff *skb, u16 vlan_tag);
 void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
 void i40e_update_rx_stats(struct i40e_ring *rx_ring,
  unsigned int total_rx_bytes,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 433c8e688c78..a6ea2b249471 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -634,7 +634,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
struct i40e_rx_buffer *bi;
union i40e_rx_desc *rx_desc;
unsigned int size;
-   u16 vlan_tag;
u8 rx_ptype;
u64 qword;
 
@@ -717,10 +716,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
   I40E_RXD_QW1_PTYPE_SHIFT;
i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
-
-   vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
-   

[PATCH net] net: use skb_list_del_init() to remove from RX sublists

2018-12-04 Thread Edward Cree
list_del() leaves the skb->next pointer poisoned, which can then lead to
 a crash in e.g. OVS forwarding.  For example, setting up an OVS VXLAN
 forwarding bridge on sfc as per:

$ ovs-vsctl show
5dfd9c47-f04b-4aaa-aa96-4fbb0a522a30
Bridge "br0"
Port "br0"
Interface "br0"
type: internal
Port "enp6s0f0"
Interface "enp6s0f0"
Port "vxlan0"
Interface "vxlan0"
type: vxlan
options: {key="1", local_ip="10.0.0.5", remote_ip="10.0.0.4"}
ovs_version: "2.5.0"

(where 10.0.0.5 is an address on enp6s0f1)
and sending traffic across it will lead to the following panic:

general protection fault: 0000 [#1] SMP PTI
CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.20.0-rc3-ehc+ #701
Hardware name: Dell Inc. PowerEdge R710/0M233H, BIOS 6.4.0 07/23/2013
RIP: 0010:dev_hard_start_xmit+0x38/0x200
Code: 53 48 89 fb 48 83 ec 20 48 85 ff 48 89 54 24 08 48 89 4c 24 18 0f 84 ab 01 00 00 48 8d 86 90 00 00 00 48 89 f5 48 89 44 24 10 <4c> 8b 33 48 c7 03 00 00 00 00 48 8b 05 c7 d1 b3 00 4d 85 f6 0f 95
RSP: 0018:ffff888627b437e0 EFLAGS: 00010202
RAX: 0000000000000000 RBX: dead000000000100 RCX: ffff88862279c000
RDX: ffff888614a342c0 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff888618a88000 R08: 0000000000000001 R09: 00000000000003e8
R10: 0000000000000000 R11: ffff888614a34140 R12: 0000000000000000
R13: 0000000000000062 R14: dead000000000100 R15: ffff888616430000
FS:  0000000000000000(0000) GS:ffff888627b40000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f6d2bc6d000 CR3: 000000000200a000 CR4: 00000000000006e0
Call Trace:
 <IRQ>
 __dev_queue_xmit+0x623/0x870
 ? masked_flow_lookup+0xf7/0x220 [openvswitch]
 ? ep_poll_callback+0x101/0x310
 do_execute_actions+0xaba/0xaf0 [openvswitch]
 ? __wake_up_common+0x8a/0x150
 ? __wake_up_common_lock+0x87/0xc0
 ? queue_userspace_packet+0x31c/0x5b0 [openvswitch]
 ovs_execute_actions+0x47/0x120 [openvswitch]
 ovs_dp_process_packet+0x7d/0x110 [openvswitch]
 ovs_vport_receive+0x6e/0xd0 [openvswitch]
 ? dst_alloc+0x64/0x90
 ? rt_dst_alloc+0x50/0xd0
 ? ip_route_input_slow+0x19a/0x9a0
 ? __udp_enqueue_schedule_skb+0x198/0x1b0
 ? __udp4_lib_rcv+0x856/0xa30
 ? __udp4_lib_rcv+0x856/0xa30
 ? cpumask_next_and+0x19/0x20
 ? find_busiest_group+0x12d/0xcd0
 netdev_frame_hook+0xce/0x150 [openvswitch]
 __netif_receive_skb_core+0x205/0xae0
 __netif_receive_skb_list_core+0x11e/0x220
 netif_receive_skb_list+0x203/0x460
 ? __efx_rx_packet+0x335/0x5e0 [sfc]
 efx_poll+0x182/0x320 [sfc]
 net_rx_action+0x294/0x3c0
 __do_softirq+0xca/0x297
 irq_exit+0xa6/0xb0
 do_IRQ+0x54/0xd0
 common_interrupt+0xf/0xf
 </IRQ>

So, in all listified-receive handling, instead pull skbs off the lists with
 skb_list_del_init().

Fixes: 9af86f933894 ("net: core: fix use-after-free in __netif_receive_skb_list_core")
Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing")
Fixes: a4ca8b7df73c ("net: ipv4: fix drop handling in ip_list_rcv() and ip_list_rcv_finish()")
Fixes: d8269e2cbf90 ("net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()")
Signed-off-by: Edward Cree 
---
I'm not sure if these are the right Fixes tags, or if I should instead be
 fingering some commit that made dev_hard_start_xmit() more sensitive to
 skb->next.
Also, I only saw a crash from the list_del() in __netif_receive_skb_list_core()
 but I converted all of them in the listified RX path, in case any others
 have similar ways to escape into paths that care about skb->next.

 net/core/dev.c   | 8 
 net/ipv4/ip_input.c  | 4 ++--
 net/ipv6/ip6_input.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3470e7fff1f4..265dd2f8ded8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5014,7 +5014,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
 
-   list_del(&skb->list);
+   skb_list_del_init(skb);
__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
@@ -5170,7 +5170,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(netdev_tstamp_prequeue, skb);
-   list_del(&skb->list);
+   skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
}
@@ -5181,7 +5181,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
rcu_read_lock();
list_for_each_entry_safe(skb, next, head, list) {
xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-   list_del(&skb->list);
+   

Re: consistency for statistics with XDP mode

2018-12-04 Thread Jakub Kicinski
On Tue, 4 Dec 2018 10:29:04 +0100, Jesper Dangaard Brouer wrote:
> On Mon, 3 Dec 2018 23:24:18 -0800
> Jakub Kicinski  wrote:
> 
> > On Tue, 04 Dec 2018 09:03:52 +0200, Toke Høiland-Jørgensen wrote:  
> > > David Miller  writes:
> > > 
> > > > From: David Ahern 
> > > > Date: Mon, 3 Dec 2018 17:15:03 -0700
> > > >  
> > > >> So, instead of a program tag which the program writer controls, how
> > > >> about some config knob that an admin controls that says at attach time
> > > >> use standard stats?  
> > > >
> > > > How about, instead of replacing it is in addition to, and admin can
> > > > override?  
> > > 
> > > Yeah, I was also thinking about something the program writer can set,
> > > but the admin can override. There could even be a system-wide setting
> > > that would make the verifier inject it into all programs at load time?
> > 
> > That'd be far preferable, having all drivers include the code when we
> > have JITs seems like a step backward.  
> 
> There is one problem with it being part of the eBPF prog.  Once eBPf
> prog is loaded you cannot change it (store in read-only page for
> security reasons).  So, that will not make it possible for an admin to
> enable stats when troubleshooting a system.  The use-case I think we
> want to support, is to allow to opt-out due to performance concerns,
> but when an admin need to troubleshoot the system, allow the admin to
> enable this system wide.
> 
> Besides placing this in every eBPF program in the system will replicate
> the stats update code (and put more I-cache pressure).  The coded
> needed is actually very simple:
> 
> [PATCH] xdp: add stats for XDP action codes in xdp_rxq_info
> 
> Code muckup of adding XDP stats
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index cc17f5f32fbb..600a95e0cbcc 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -628,7 +628,10 @@ static __always_inline u32 bpf_prog_run_xdp(const struct 
> bpf_prog *prog,
>  * already takes rcu_read_lock() when fetching the program, so
>  * it's not necessary here anymore.
>  */
> -   return BPF_PROG_RUN(prog, xdp);
> +   u32 action = BPF_PROG_RUN(prog, xdp);
> +   // Q: will adding a branch here cost more than always accounting?
> +   xdp->rxq->stats[action <= XDP_REDIRECT ? action : 0]++;
> +   return action;
>  }

Now if we can turn it into a trampoline dynamically inserted on program
invocation that would be perfect :)

>  static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 4a0ca7a3d5e5..3409dd9e0fbc 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -6,6 +6,7 @@
>  #ifndef __LINUX_NET_XDP_H__
>  #define __LINUX_NET_XDP_H__
>  
> +#include <uapi/linux/bpf.h>
>  /**
>   * DOC: XDP RX-queue information
>   *
> @@ -61,6 +62,8 @@ struct xdp_rxq_info {
> u32 queue_index;
> u32 reg_state;
> struct xdp_mem_info mem;
> +   // TODO: benchmark if stats should be placed on different cache-line
> +   u64 stats[XDP_REDIRECT + 1];
>  } ____cacheline_aligned; /* perf critical, avoid false-sharing */
>  
>  struct xdp_buff {
  
> > We could probably fit the stats into the enormous padding of struct
> > xdp_rxq_info, the question is - how do we get to it in a clean way..  
> 
> Struct xdp_rxq_info is explicitly a read-only cache-line, which contain
> static information for each RX-queue.  We could place the stats record
> in the next cache-line (most HW systems fetch 2 cache-lines).  But we
> can also benchmark if it matters changing xdp_rxq_info to be a
> write-cache-line.

I see, I was thinking the xdp_frame work made it generally less of an
issue, but I haven't double checked.  Perhaps keeping the first cache
line read-only is a good rule to stick to regardless.


Re: [PATCH rdma-next 0/7] Enrich DEVX support

2018-12-04 Thread Doug Ledford
On Mon, 2018-11-26 at 08:28 +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> From Yishai,
> ---
> This series enriches DEVX support in few aspects: it enables interoperability
> between DEVX and verbs and improves mechanism for controlling privileged DEVX
> commands.
> 
> The first patch updates mlx5 ifc file.
> 
> Next 3 patches enable modifying and querying verbs objects via the DEVX
> interface.
> 
> To achieve that the core layer introduced the 'UVERBS_IDR_ANY_OBJECT' type
> to match any IDR object. Once it's used by some driver's method, the
> infrastructure skips checking for the IDR type and it becomes the driver
> handler responsibility.
> 
> The DEVX methods of modify and query were changed to get any object type via
> the 'UVERBS_IDR_ANY_OBJECT' mechanism. The type checking is done per object as
> part of the driver code.
> 
> The next 3 patches introduce more robust mechanism for controlling privileged
> DEVX commands. The responsibility to block/allow per command was moved to be
> done in the firmware based on the UID credentials that the driver reports upon
> user context creation. This enables more granularity per command based on the
> device security model and the user credentials.
> 
> In addition, by introducing a valid range for 'general commands' we prevent 
> the
> need to touch the driver's code any time that a new future command will be
> added.
> 
> The last patch fixes the XRC verbs flow once a DEVX context is used. This is
> needed as XRCD is some shared kernel resource and as such a kernel UID (=0)
> should be used in its related resources.
> 
> Thanks
> 
> Yishai Hadas (7):
>   net/mlx5: Update mlx5_ifc with DEVX UCTX capabilities bits
>   IB/core: Introduce UVERBS_IDR_ANY_OBJECT
>   IB/core: Enable getting an object type from a given uobject
>   IB/mlx5: Enable modify and query verbs objects via DEVX
>   IB/mlx5: Enforce DEVX privilege by firmware
>   IB/mlx5: Update the supported DEVX commands
>   IB/mlx5: Allow XRC usage via verbs in DEVX context
> 
>  drivers/infiniband/core/rdma_core.c   |  27 +++--
>  drivers/infiniband/core/rdma_core.h   |  21 ++--
>  drivers/infiniband/core/uverbs_uapi.c |  10 +-
>  drivers/infiniband/hw/mlx5/devx.c | 142 ++
>  drivers/infiniband/hw/mlx5/main.c |   4 +-
>  drivers/infiniband/hw/mlx5/mlx5_ib.h  |   6 +-
>  drivers/infiniband/hw/mlx5/qp.c   |  12 +--
>  drivers/infiniband/hw/mlx5/srq.c  |   2 +-
>  include/linux/mlx5/mlx5_ifc.h |  26 -
>  include/rdma/uverbs_ioctl.h   |   6 ++
>  include/rdma/uverbs_std_types.h   |  12 +++
>  11 files changed, 215 insertions(+), 53 deletions(-)
> 
> --
> 2.19.1
> 

Hi Leon,

I've put this into my wip/dl-for-next branch.  Because it pulled in the
mlx5-next tree (which was significant because the patch we wanted was at
the tip and a whole slew of other patches were before it), the changeset
for this patch looks much bigger.  And it had the merge conflict you
mentioned.  You might want to double check my work before it gets moved
over to the official for-next branch.

-- 
Doug Ledford 
GPG KeyID: B826A3330E572FDD
Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD




[PATCH net] macvlan: remove duplicate check

2018-12-04 Thread Matteo Croce
Following commit 59f997b088d2 ("macvlan: return correct error value"),
there is a duplicate check for mac addresses both in macvlan_sync_address()
and macvlan_set_mac_address().
As the former calls the latter, remove the one in macvlan_set_mac_address()
and move the one in macvlan_sync_address() before any other check.

Signed-off-by: Matteo Croce 
---
 drivers/net/macvlan.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 0da3d36b283b..f3361aabdb78 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -700,14 +700,14 @@ static int macvlan_sync_address(struct net_device *dev, 
unsigned char *addr)
struct macvlan_port *port = vlan->port;
int err;
 
+   if (macvlan_addr_busy(vlan->port, addr))
+   return -EADDRINUSE;
+
if (!(dev->flags & IFF_UP)) {
/* Just copy in the new address */
ether_addr_copy(dev->dev_addr, addr);
} else {
/* Rehash and update the device filters */
-   if (macvlan_addr_busy(vlan->port, addr))
-   return -EADDRINUSE;
-
if (!macvlan_passthru(port)) {
err = dev_uc_add(lowerdev, addr);
if (err)
@@ -747,9 +747,6 @@ static int macvlan_set_mac_address(struct net_device *dev, 
void *p)
return dev_set_mac_address(vlan->lowerdev, addr);
}
 
-   if (macvlan_addr_busy(vlan->port, addr->sa_data))
-   return -EADDRINUSE;
-
return macvlan_sync_address(dev, addr->sa_data);
 }
 
-- 
2.19.2



Re: [PATCH net] rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices

2018-12-04 Thread Ido Schimmel
On Tue, Dec 04, 2018 at 09:40:35AM -0800, Eric Dumazet wrote:
> kmsan was able to trigger a kernel-infoleak using a gre device [1]
> 
> nlmsg_populate_fdb_fill() has a hard coded assumption
> that dev->addr_len is ETH_ALEN, as normally guaranteed
> for ARPHRD_ETHER devices.
> 
> A similar issue was fixed recently in commit da71577545a5
> ("rtnetlink: Disallow FDB configuration for non-Ethernet device")

...

> Fixes: d83b06036048 ("net: add fdb generic dump routine")
> Signed-off-by: Eric Dumazet 
> Cc: John Fastabend 
> Cc: Ido Schimmel 
> Cc: David Ahern 

Reviewed-by: Ido Schimmel 

Thanks!
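For reference, the fix amounts to bailing out of ndo_dflt_fdb_dump() early for non-Ethernet devices, mirroring the commit quoted above; a sketch of the guard, not the final hunk:

	/* nlmsg_populate_fdb_fill() hard-codes dev->addr_len == ETH_ALEN,
	 * so refuse the default FDB dump for anything not ARPHRD_ETHER */
	if (dev->type != ARPHRD_ETHER)
		return -EINVAL;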


Re: [PATCH net-next] net: netem: use a list in addition to rbtree

2018-12-04 Thread Stephen Hemminger
I like this, it makes a lot of sense since packets are almost
always queued in order.

Minor style stuff you might want to fix (but don't have to).

> + if (!last ||
> + t_last->time_to_send > last->time_to_send) {
> + last = t_last;
> + }

I don't think you need braces here for single assignment.

> +static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
> +{
> + if (skb == q->t_head) {
> + q->t_head = skb->next;
> + if (!q->t_head)
> + q->t_tail = NULL;
> + } else
> + rb_erase(&skb->rbnode, &q->t_root);

Checkpatch wants both sides of if/else to have brackets.
Personally, don't care.

Reviewed-by: Stephen Hemminger 
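For readers without the patch in front of them, the enqueue side that makes this work looks roughly like the following (condensed from the patch under review; the rbtree slow path is folded into a hypothetical helper):

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		/* common case: in-order packet, append to the linear list */
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		/* reordered packet: fall back to the rbtree */
		netem_enqueue_in_rb_tree(q, nskb, tnext);
	}
	sch->q.qlen++;
}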


Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Jakub Audykowicz
On 2018-12-04 18:45, Marcelo Ricardo Leitner wrote:

> On Tue, Dec 04, 2018 at 06:00:51PM +0100, Jakub Audykowicz wrote:
> ...
>> OK, let's forget about that "if" :)
>> Coming back to the sanity check, I came up with something like below,
>> based on the code from sctp_setsockopt_maxseg, like you mentioned.
>> I may have overcomplicated things since I didn't know how to accomplish
>> the same thing without passing sctp_sock* to sctp_datamsg_from_user.
> Yep. More below.
>
>> I wanted to avoid calling sctp_min_frag_point unless absolutely
>> necessary, so I just check the frag_point against the zero that is
>> causing the eventual kernel panic.
> Makes sense.
>
>>  
>> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
>> index ab9242e51d9e..7e67c0257b3f 100644
>> --- a/include/net/sctp/sctp.h
>> +++ b/include/net/sctp/sctp.h
>> @@ -620,4 +620,15 @@ static inline bool sctp_transport_pmtu_check(struct 
>> sctp_transport *t)
>>  return false;
>>  }
>>  
>> +static inline __u16 sctp_data_chunk_size(struct sctp_association *asoc)
>> +{
>> +return asoc ? sctp_datachk_len(&asoc->stream) :
>> +  sizeof(struct sctp_data_chunk);
> I don't think we need another layer on top of data chunk sizes here.
> We don't need this asoc check by the sendmsg callpath because it
> cannot be NULL by then. That said, I think we have avoid this helper,
> for now at least.
>
> One other way would be to include the check into sctp_datachk_len(),
> but we currently have 9 calls to that but only 1 requires the check.
>
> As asoc is initialized and considering the fix we just had in this
> area, stream->si will also be initialized so you should be good to
> just call sctp_datachk_len() directly.
>
>> +}
>> +
>> +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 
>> datasize)
>> +{
>> +return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
>> +}
> This is a good helper to have. Makes it clearer.
>
>> +
>>  #endif /* __net_sctp_h__ */
>> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>> index a11f93790476..d09b5de73c92 100644
>> --- a/include/net/sctp/structs.h
>> +++ b/include/net/sctp/structs.h
>> @@ -543,7 +543,8 @@ struct sctp_datamsg {
>>  
>>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
>>  struct sctp_sndrcvinfo *,
>> -struct iov_iter *);
>> +struct iov_iter *,
>> +struct sctp_sock *);
>>  void sctp_datamsg_free(struct sctp_datamsg *);
>>  void sctp_datamsg_put(struct sctp_datamsg *);
>>  void sctp_chunk_fail(struct sctp_chunk *, int error);
>> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
>> index ce8087846f05..753c2c123767 100644
>> --- a/net/sctp/chunk.c
>> +++ b/net/sctp/chunk.c
>> @@ -164,7 +164,8 @@ static void sctp_datamsg_assign(struct sctp_datamsg 
>> *msg, struct sctp_chunk *chu
>>   */
>>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>>  struct sctp_sndrcvinfo *sinfo,
>> -struct iov_iter *from)
>> +struct iov_iter *from,
>> +struct sctp_sock *sp)
>>  {
>>  size_t len, first_len, max_data, remaining;
>>  size_t msg_len = iov_iter_count(from);
>> @@ -173,6 +174,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
>> sctp_association *asoc,
>>  struct sctp_chunk *chunk;
>>  struct sctp_datamsg *msg;
>>  int err;
>> +__u32 min_frag_point;
> Reverse christmass tree.. swap the last two lines:
> + __u32 min_frag_point;
>   int err;
> But I guess we don't need this var anymore:
>
>>  
>>  msg = sctp_datamsg_new(GFP_KERNEL);
>>  if (!msg)
>> @@ -190,6 +192,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
>> sctp_association *asoc,
>>  /* This is the biggest possible DATA chunk that can fit into
>>   * the packet
>>   */
>> +if (unlikely(asoc->frag_point == 0)) {
>  !asoc->frag_point   instead
>
> You can get to sctp_sock here with: sctp_sk(asoc->base.sk)
>   struct sctp_sock *sp = sctp_sk(asoc->base.sk);
>
>> +min_frag_point = sctp_min_frag_point(sp, 
>> sctp_data_chunk_size(asoc));
>> +pr_warn_ratelimited("%s: asoc:%p frag_point is too low (%d < 
>> %d), using default minimum",
>> +__func__, asoc, asoc->frag_point, min_frag_point);
>> +asoc->frag_point = min_frag_point;
> No no. Lets not fix up things here. If we do this assignment, we may
> make the disparity even worse.
> With that, we can work only on max_data and avoid the need of min_frag_point.
>   max_data = asoc->frag_point;
>   if (unlikely(!max_data)) {
>   ...
>   max_data = ...
>   }
>
>> +}
>>  max_data = 
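Pulling the suggestions above together, the sendmsg-side check would look roughly like this; a sketch assuming the sctp_min_frag_point() helper proposed in this thread, not necessarily the final committed fix:

	/* This is the biggest possible DATA chunk that can fit into
	 * the packet
	 */
	max_data = asoc->frag_point;
	if (unlikely(!max_data)) {
		struct sctp_sock *sp = sctp_sk(asoc->base.sk);

		max_data = sctp_min_frag_point(sp,
					       sctp_datachk_len(&asoc->stream));
		pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)\n",
				    __func__, asoc, max_data);
	}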

Re: [PATCH net 1/2] net/mlx4_en: Change min MTU size to ETH_MIN_MTU

2018-12-04 Thread Eric Dumazet



On 12/04/2018 08:59 AM, David Laight wrote:
> From: Tariq Toukan
>> Sent: 02 December 2018 12:35
>> From: Eran Ben Elisha 
>>
>> NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in
>> the RFC791 and in the network stack. Remove old mlx4_en only define for
>> it, which was set to wrong value.
> ...
>>
>> -/* MTU range: 46 - hw-specific max */
>> -dev->min_mtu = MLX4_EN_MIN_MTU;
>> +/* MTU range: 68 - hw-specific max */
>> +dev->min_mtu = ETH_MIN_MTU;
>>  dev->max_mtu = priv->max_mtu;
> 
> Where does 68 come from?

Min IPv4 MTU per RFC791

> The minimum size of an ethernet packet including the mac addresses
> and CRC is 64 bytes - but that would never be an 'mtu'.
> 
> Since 64 - 46 = 18, the 46 probably excludes both MAC addresses,
> the ethertype/length and the CRC.
> This is 'sort of' the minimum mtu for an ethernet frame.
> 
> I'm not sure which values are supposed to be in dev->min/max_mtu.

I am not sure we really care, only syzkaller can possibly.



Re: [PATCH net-next 1/4] indirect call wrappers: helpers to speed-up indirect calls of builtin

2018-12-04 Thread Paolo Abeni
On Tue, 2018-12-04 at 17:13 +, Edward Cree wrote:
> On 03/12/18 11:40, Paolo Abeni wrote:
> > This header defines a bunch of helpers that allow avoiding the
> > retpoline overhead when calling builtin functions via function pointers.
> > It boils down to explicitly comparing the function pointers to
> > known builtin functions and eventually invoke directly the latter.
> > 
> > The macros defined here implement the boilerplate for the above schema
> > and will be used by the next patches.
> > 
> > rfc -> v1:
> >  - use branch prediction hint, as suggested by Eric
> > 
> > Suggested-by: Eric Dumazet 
> > Signed-off-by: Paolo Abeni 
> > ---
> I'm not sure I see the reason why this is done with numbers and
>  'name ## NR', adding extra distance between the callsite and the
>  list of callees.  In particular it means that each callable needs
>  to specify its index.
> Wouldn't it be simpler just to have
> #define 1(f, f1, ...) \
> (likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
> #define INDIRECT_CALL_2(f, f2, f1, ...) \
> (likely(f == f2) ? f2(__VA_ARGS__) : INDIRECT_CALL_1(f, f1, 
> __VA_ARGS__))
> etc.?  Removing the need for INDIRECT_CALLABLE_DECLARE_* entirely.

Thank you for the review!

As some of the builtin symbols are static, we would still need some
macro wrappers to properly specify the scope when retpoline is enabled.

Also, I think that f1, f2... declaration before INDIRECT_CALL_ would be
uglier, as we need to list there the function names (so we would have
the same list in 2 places).

Anyway this really sounds like one of those things that will enrage the guys on lkml.
Suggestions for alternative solutions more than welcome ;)

> PS: this has reminded me of my desire to try runtime creation of
> these kinds of branch tables with self-modifying code

This:

https://lore.kernel.org/lkml/cover.1543200841.git.jpoim...@redhat.com/T/#ma30f6b2aa655c99e93cfb267fef75b8fe9fca29b

is possibly related to what you are planning. AFAICS should work only
for global function pointers, not for e.g. function ptr inside lists,
so the above and this series should be complementary.

Cheers,

Paolo
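
A minimal sketch of the helper pattern discussed in this thread (illustrative
only; the macro bodies below are assumptions based on the description, not the
posted patch):

#ifdef CONFIG_RETPOLINE
/* the callees must be visible at the call site, even when defined static */
#define INDIRECT_CALLABLE_DECLARE(f)	f
#define INDIRECT_CALLABLE_SCOPE
#define INDIRECT_CALL_1(f, f1, ...)					\
	(likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
#define INDIRECT_CALL_2(f, f2, f1, ...)					\
	(likely(f == f2) ? f2(__VA_ARGS__) :				\
			   INDIRECT_CALL_1(f, f1, __VA_ARGS__))
#else
/* without retpolines a plain indirect call is cheap; keep callees static */
#define INDIRECT_CALLABLE_DECLARE(f)
#define INDIRECT_CALLABLE_SCOPE		static
#define INDIRECT_CALL_1(f, f1, ...)	f(__VA_ARGS__)
#define INDIRECT_CALL_2(f, f2, f1, ...)	f(__VA_ARGS__)
#endif

/* hypothetical call site: try two builtin handlers before the indirect call */
INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *skb));

static int deliver(int (*handler)(struct sk_buff *), struct sk_buff *skb)
{
	return INDIRECT_CALL_2(handler, tcp_v4_rcv, udp_rcv, skb);
}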





Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 06:00:51PM +0100, Jakub Audykowicz wrote:
...
> OK, let's forget about that "if" :)
> Coming back to the sanity check, I came up with something like below,
> based on the code from sctp_setsockopt_maxseg, like you mentioned.
> I may have overcomplicated things since I didn't know how to accomplish
> the same thing without passing sctp_sock* to sctp_datamsg_from_user.

Yep. More below.

> I wanted to avoid calling sctp_min_frag_point unless absolutely
> necessary, so I just check the frag_point against the zero that is
> causing the eventual kernel panic.

Makes sense.

>  
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index ab9242e51d9e..7e67c0257b3f 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -620,4 +620,15 @@ static inline bool sctp_transport_pmtu_check(struct 
> sctp_transport *t)
>   return false;
>  }
>  
> +static inline __u16 sctp_data_chunk_size(struct sctp_association *asoc)
> +{
> +return asoc ? sctp_datachk_len(&asoc->stream) :
> +  sizeof(struct sctp_data_chunk);

I don't think we need another layer on top of data chunk sizes here.
We don't need this asoc check by the sendmsg callpath because it
cannot be NULL by then. That said, I think we should avoid this helper,
for now at least.

One other way would be to include the check into sctp_datachk_len(),
but we currently have 9 calls to that and only 1 requires the check.

As asoc is initialized and considering the fix we just had in this
area, stream->si will also be initialized so you should be good to
just call sctp_datachk_len() directly.

> +}
> +
> +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> +{
> +return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
> +}

This is a good helper to have. Makes it clearer.

> +
>  #endif /* __net_sctp_h__ */
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index a11f93790476..d09b5de73c92 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -543,7 +543,8 @@ struct sctp_datamsg {
>  
>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
>   struct sctp_sndrcvinfo *,
> - struct iov_iter *);
> + struct iov_iter *,
> + struct sctp_sock *);
>  void sctp_datamsg_free(struct sctp_datamsg *);
>  void sctp_datamsg_put(struct sctp_datamsg *);
>  void sctp_chunk_fail(struct sctp_chunk *, int error);
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index ce8087846f05..753c2c123767 100644
> --- a/net/sctp/chunk.c
> +++ b/net/sctp/chunk.c
> @@ -164,7 +164,8 @@ static void sctp_datamsg_assign(struct sctp_datamsg *msg, 
> struct sctp_chunk *chu
>   */
>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>   struct sctp_sndrcvinfo *sinfo,
> - struct iov_iter *from)
> + struct iov_iter *from,
> + struct sctp_sock *sp)
>  {
>   size_t len, first_len, max_data, remaining;
>   size_t msg_len = iov_iter_count(from);
> @@ -173,6 +174,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> sctp_association *asoc,
>   struct sctp_chunk *chunk;
>   struct sctp_datamsg *msg;
>   int err;
> + __u32 min_frag_point;

Reverse christmas tree... swap the last two lines:
+   __u32 min_frag_point;
int err;
But I guess we don't need this var anymore:

>  
>   msg = sctp_datamsg_new(GFP_KERNEL);
>   if (!msg)
> @@ -190,6 +192,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> sctp_association *asoc,
>   /* This is the biggest possible DATA chunk that can fit into
>* the packet
>*/
> + if (unlikely(asoc->frag_point == 0)) {
 !asoc->frag_point   instead

You can get to sctp_sock here with: sctp_sk(asoc->base.sk)
struct sctp_sock *sp = sctp_sk(asoc->base.sk);

> + min_frag_point = sctp_min_frag_point(sp, 
> sctp_data_chunk_size(asoc));
> + pr_warn_ratelimited("%s: asoc:%p frag_point is too low (%d < %d), using default minimum",
> + __func__, asoc, asoc->frag_point, min_frag_point);
> + asoc->frag_point = min_frag_point;

No no. Let's not fix up things here. If we do this assignment, we may
make the disparity even worse.
With that, we can work only on max_data and avoid the need for min_frag_point.
max_data = asoc->frag_point;
if (unlikely(!max_data)) {
...
max_data = ...
}

> + }
>   max_data = asoc->frag_point;
>  
>   /* If the the peer requested that we authenticate DATA chunks
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 

Re: consistency for statistics with XDP mode

2018-12-04 Thread Michael S. Tsirkin
On Tue, Dec 04, 2018 at 10:29:04AM +0100, Jesper Dangaard Brouer wrote:
> On Mon, 3 Dec 2018 23:24:18 -0800
> Jakub Kicinski  wrote:
> 
> > On Tue, 04 Dec 2018 09:03:52 +0200, Toke Høiland-Jørgensen wrote:
> > > David Miller  writes:
> > >   
> > > > From: David Ahern 
> > > > Date: Mon, 3 Dec 2018 17:15:03 -0700
> > > >
> > > >> So, instead of a program tag which the program writer controls, how
> > > >> about some config knob that an admin controls that says at attach time
> > > >> use standard stats?
> > > >
> > > > How about, instead of replacing it is in addition to, and admin can
> > > > override?
> > > 
> > > Yeah, I was also thinking about something the program writer can set,
> > > but the admin can override. There could even be a system-wide setting
> > > that would make the verifier inject it into all programs at load time?  
> > 
> > That'd be far preferable, having all drivers include the code when we
> > have JITs seems like a step backward.
> 
> There is one problem with it being part of the eBPF prog.  Once an eBPF
> prog is loaded you cannot change it (it is stored in a read-only page for
> security reasons).  So, that will not make it possible for an admin to
> enable stats when troubleshooting a system.  The use-case I think we
> want to support is allowing an opt-out due to performance concerns,
> but when an admin needs to troubleshoot the system, allowing the admin to
> enable this system wide.
> 
> Besides, placing this in every eBPF program in the system will replicate
> the stats update code (and put more I-cache pressure).  The code
> needed is actually very simple:
> 
> [PATCH] xdp: add stats for XDP action codes in xdp_rxq_info
> 
> Code muckup of adding XDP stats
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index cc17f5f32fbb..600a95e0cbcc 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -628,7 +628,10 @@ static __always_inline u32 bpf_prog_run_xdp(const struct 
> bpf_prog *prog,
>  * already takes rcu_read_lock() when fetching the program, so
>  * it's not necessary here anymore.
>  */
> -   return BPF_PROG_RUN(prog, xdp);
> +   u32 action = BPF_PROG_RUN(prog, xdp);
> +   // Q: will adding a branch here cost more than always accounting?
> +   xdp->rxq->stats[action <= XDP_REDIRECT ? action : 0]++;

We need array_index_nospec I guess?

> +   return action;
>  }
>  
>  static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 4a0ca7a3d5e5..3409dd9e0fbc 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -6,6 +6,7 @@
>  #ifndef __LINUX_NET_XDP_H__
>  #define __LINUX_NET_XDP_H__
>  
> +#include 
>  /**
>   * DOC: XDP RX-queue information
>   *
> @@ -61,6 +62,8 @@ struct xdp_rxq_info {
> u32 queue_index;
> u32 reg_state;
> struct xdp_mem_info mem;
> +   // TODO: benchmark if stats should be placed on different cache-line
> +   u64 stats[XDP_REDIRECT + 1];
>  } cacheline_aligned; /* perf critical, avoid false-sharing */
>  
>  struct xdp_buff {
> 
> 
> 
> > We could probably fit the stats into the enormous padding of struct
> > xdp_rxq_info, the question is - how do we get to it in a clean way..
> 
> Struct xdp_rxq_info is explicitly a read-only cache-line, which contains
> static information for each RX-queue.  We could place the stats record
> in the next cache-line (most HW systems fetch 2 cache-lines).  But we
> can also benchmark if it matters changing xdp_rxq_info to be a
> write-cache-line.
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
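
A sketch of the speculation-safe indexing Michael asks about above
(array_index_nospec() is in include/linux/nospec.h; everything else follows
Jesper's mockup and is illustrative):

#include <linux/nospec.h>

static __always_inline u32 bpf_prog_run_xdp_stats(const struct bpf_prog *prog,
						  struct xdp_buff *xdp)
{
	u32 action = BPF_PROG_RUN(prog, xdp);
	/* clamp unknown actions to slot 0, then sanitize the index so a
	 * mispredicted bounds check cannot leak out-of-bounds data
	 */
	u32 idx = action <= XDP_REDIRECT ? action : 0;

	idx = array_index_nospec(idx, XDP_REDIRECT + 1);
	xdp->rxq->stats[idx]++;
	return action;
}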


Re: [PATCH net-next 1/4] indirect call wrappers: helpers to speed-up indirect calls of builtin

2018-12-04 Thread Edward Cree
On 04/12/18 17:44, Paolo Abeni wrote:
> On Tue, 2018-12-04 at 17:13 +, Edward Cree wrote:
>> On 03/12/18 11:40, Paolo Abeni wrote:
>>> This header defines a bunch of helpers that allow avoiding the
>>> retpoline overhead when calling builtin functions via function pointers.
>>> It boils down to explicitly comparing the function pointers to
>>> known builtin functions and eventually invoke directly the latter.
>>>
>>> The macros defined here implement the boilerplate for the above schema
>>> and will be used by the next patches.
>>>
>>> rfc -> v1:
>>>  - use branch prediction hint, as suggested by Eric
>>>
>>> Suggested-by: Eric Dumazet 
>>> Signed-off-by: Paolo Abeni 
>>> ---
>> I'm not sure I see the reason why this is done with numbers and
>>  'name ## NR', adding extra distance between the callsite and the
>>  list of callees.  In particular it means that each callable needs
>>  to specify its index.
>> Wouldn't it be simpler just to have
>> #define INDIRECT_CALL_1(f, f1, ...) \
>> (likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
>> #define INDIRECT_CALL_2(f, f2, f1, ...) \
>> (likely(f == f2) ? f2(__VA_ARGS__) : INDIRECT_CALL_1(f, f1, 
>> __VA_ARGS__))
>> etc.?  Removing the need for INDIRECT_CALLABLE_DECLARE_* entirely.
> Thank you for the review!
>
> As some of the builtin symbols are static, we would still need some
> macro wrappers to properly specify the scope when retpoline is enabled.
Ah I see, it hadn't occurred to me that static callees might not be
 available at the callsite.  Makes sense now.  In that case, have my
 Acked-By for this patch, if you want it.

> This:
> https://lore.kernel.org/lkml/cover.1543200841.git.jpoim...@redhat.com/T/#ma30f6b2aa655c99e93cfb267fef75b8fe9fca29b
>
> is possibly related to what you are planning.
Yes!  That looks like exactly the sort of machinery I need, thanks.
My idea is to build on that something that counts stats of where each
 indirect call goes, then every now and then patches in a new list of
 possible targets to an INDIRECT_CALL_*()-style jump tree, based on
 the N callees that were seen the most often.
(Such a solution could even cope with callees in modules: the jump
 table doesn't need a reference on the module because it only calls
 the function (from the module) if the function pointer is pointing at
 it, in which case (hopefully) whoever supplied the function pointer
 has a reference to the module.)
But I'm not sure I have the mad hacker skillz to make it work.

-Ed


Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 07:51:54PM +0100, Jakub Audykowicz wrote:
...
> Thanks, I've taken your remarks into account and ended up with this 
> simple solution:

LGTM! Thanks

> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index ab9242e51d9e..3487686f2cf5 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct 
> sctp_transport *t)
>   return false;
>  }
>  
> +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> +{
> +return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);

Not sure how final the patch is but be sure to run checkpatch.pl on
it before submitting it officially. It flagged some issues like the
above indentation using spaces, for example.

> +}
> +
>  #endif /* __net_sctp_h__ */
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index ce8087846f05..dc12c2ba487f 100644
> --- a/net/sctp/chunk.c
> +++ b/net/sctp/chunk.c
> @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct 
> sctp_association *asoc,
>* the packet
>*/
>   max_data = asoc->frag_point;
> + if (unlikely(!max_data)) {
> + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum",
> + __func__, asoc);
> + max_data = sctp_min_frag_point(
> + sctp_sk(asoc->base.sk), 
> sctp_datachk_len(&asoc->stream));
> + }
>  
>   /* If the the peer requested that we authenticate DATA chunks
>* we need to account for bundling of the AUTH chunks along with
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index bf618d1b41fd..b8cebd5a87e5 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char 
> __user *optval, unsigned
>   __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
>sizeof(struct sctp_data_chunk);
>  
> - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
> -datasize);
> + min_len = sctp_min_frag_point(sp, datasize);
>   max_len = SCTP_MAX_CHUNK_LEN - datasize;
>  
>   if (val < min_len || val > max_len)
> 
>  
> 


Re: [Patch net v3] mlx5: force CHECKSUM_NONE for short ethernet frames

2018-12-04 Thread Saeed Mahameed
On Mon, Dec 3, 2018 at 10:14 PM Cong Wang  wrote:
>
> When an ethernet frame is padded to meet the minimum ethernet frame
> size, the padding octets are not covered by the hardware checksum.
> Fortunately the padding octets are usually zeros, which don't affect the
> checksum. However, we have a switch which pads with non-zero octets; this
> causes repeated kernel hardware checksum faults.
>
> Prior to commit 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE 
> are friends"),
> skb checksum was forced to be CHECKSUM_NONE when padding is detected.
> After it, we need to keep skb->csum updated, like what we do for RXFCS.
> However, fixing up CHECKSUM_COMPLETE requires verifying and parsing the IP
> headers; it is not worth the effort as the packets are so small that
> CHECKSUM_COMPLETE can't save anything.
>
> I tested this patch with RXFCS on and off, it works fine without any
> warning in both cases.
>
> Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are 
> friends"),
> Cc: Saeed Mahameed 
> Cc: Eric Dumazet 
> Cc: Tariq Toukan 
> Signed-off-by: Cong Wang 
> ---
>  .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 22 ++-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index 624eed345b5d..1c153b8091da 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -732,6 +732,13 @@ static u8 get_ip_proto(struct sk_buff *skb, int 
> network_depth, __be16 proto)
> ((struct ipv6hdr *)ip_p)->nexthdr;
>  }
>
> +static bool is_short_frame(struct sk_buff *skb, bool has_fcs)
> +{
> +   u32 frame_len = has_fcs ? skb->len - ETH_FCS_LEN : skb->len;
> +
> +   return frame_len <= ETH_ZLEN;
> +}
> +

Do we really need to handle FCS here?
Maybe increase the small packet size to always assume FCS is there:

return skb->len <= ETH_ZLEN + ETH_FCS_LEN;
to avoid conditional statements in the data path.

>  static inline void mlx5e_handle_csum(struct net_device *netdev,
>  struct mlx5_cqe64 *cqe,
>  struct mlx5e_rq *rq,
> @@ -755,9 +762,22 @@ static inline void mlx5e_handle_csum(struct net_device 
> *netdev,
> goto csum_unnecessary;
>
> if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) {
> +   bool has_fcs = !!(netdev->features & NETIF_F_RXFCS);
> +
> if (unlikely(get_ip_proto(skb, network_depth, proto) == 
> IPPROTO_SCTP))
> goto csum_unnecessary;
>
> +   /* CQE csum doesn't cover padding octets in short ethernet
> +* frames. And the pad field is appended prior to calculating
> +* and appending the FCS field.
> +*
> +* Detecting these padded frames requires to verify and parse
> +* IP headers, so we simply force all those small frames to be
> +* CHECKSUM_NONE even if they are not padded.
> +*/
> +   if (unlikely(is_short_frame(skb, has_fcs)))
> +   goto csum_none;
> +

As Eric mentioned, goto csum_unnecessary; here, and the code will handle the rest.
For l3/l4 packets the HW already verifies the checksum; we must leverage that.

> skb->ip_summed = CHECKSUM_COMPLETE;
> skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
> if (network_depth > ETH_HLEN)
> @@ -768,7 +788,7 @@ static inline void mlx5e_handle_csum(struct net_device 
> *netdev,
> skb->csum = csum_partial(skb->data + ETH_HLEN,
>  network_depth - ETH_HLEN,
>  skb->csum);
> -   if (unlikely(netdev->features & NETIF_F_RXFCS))
> +   if (unlikely(has_fcs))
> skb->csum = csum_block_add(skb->csum,
>(__force 
> __wsum)mlx5e_get_fcs(skb),
>skb->len - ETH_FCS_LEN);
> --
> 2.19.1
>
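
For reference, a sketch of the simplification suggested above (hypothetical
helper; it assumes treating every minimum-size frame as potentially padded,
FCS kept or not, is acceptable):

static bool is_short_frame(const struct sk_buff *skb)
{
	/* no NETIF_F_RXFCS branch in the data path: a frame this small may
	 * be a padded minimum-size frame whether or not the FCS was kept
	 */
	return skb->len <= ETH_ZLEN + ETH_FCS_LEN;
}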


RE: [PATCH net 1/2] net/mlx4_en: Change min MTU size to ETH_MIN_MTU

2018-12-04 Thread David Laight
From: Eric Dumazet
> Sent: 04 December 2018 17:04
> On 12/04/2018 08:59 AM, David Laight wrote:
> > From: Tariq Toukan
> >> Sent: 02 December 2018 12:35
> >> From: Eran Ben Elisha 
> >>
> >> NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in
> >> the RFC791 and in the network stack. Remove old mlx4_en only define for
> >> it, which was set to the wrong value.
> > ...
> >>
> >> -  /* MTU range: 46 - hw-specific max */
> >> -  dev->min_mtu = MLX4_EN_MIN_MTU;
> >> +  /* MTU range: 68 - hw-specific max */
> >> +  dev->min_mtu = ETH_MIN_MTU;
> >>dev->max_mtu = priv->max_mtu;
> >
> > Where does 68 come from?
> 
> Min IPv4 MTU per RFC791

Maybe I'm just confused and these are the ranges that the 'maximum mtu'
can be set to.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)


[PATCH net] rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices

2018-12-04 Thread Eric Dumazet
kmsan was able to trigger a kernel-infoleak using a gre device [1]

nlmsg_populate_fdb_fill() has a hard coded assumption
that dev->addr_len is ETH_ALEN, as normally guaranteed
for ARPHRD_ETHER devices.

A similar issue was fixed recently in commit da71577545a5
("rtnetlink: Disallow FDB configuration for non-Ethernet device")

[1]
BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:143 [inline]
BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576
CPU: 0 PID: 6697 Comm: syz-executor310 Not tainted 4.20.0-rc3+ #95
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x32d/0x480 lib/dump_stack.c:113
 kmsan_report+0x12c/0x290 mm/kmsan/kmsan.c:683
 kmsan_internal_check_memory+0x32a/0xa50 mm/kmsan/kmsan.c:743
 kmsan_copy_to_user+0x78/0xd0 mm/kmsan/kmsan_hooks.c:634
 copyout lib/iov_iter.c:143 [inline]
 _copy_to_iter+0x4c0/0x2700 lib/iov_iter.c:576
 copy_to_iter include/linux/uio.h:143 [inline]
 skb_copy_datagram_iter+0x4e2/0x1070 net/core/datagram.c:431
 skb_copy_datagram_msg include/linux/skbuff.h:3316 [inline]
 netlink_recvmsg+0x6f9/0x19d0 net/netlink/af_netlink.c:1975
 sock_recvmsg_nosec net/socket.c:794 [inline]
 sock_recvmsg+0x1d1/0x230 net/socket.c:801
 ___sys_recvmsg+0x444/0xae0 net/socket.c:2278
 __sys_recvmsg net/socket.c:2327 [inline]
 __do_sys_recvmsg net/socket.c:2337 [inline]
 __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334
 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334
 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291
 entry_SYSCALL_64_after_hwframe+0x63/0xe7
RIP: 0033:0x441119
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 
db 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:7fffc7f008a8 EFLAGS: 0207 ORIG_RAX: 002f
RAX: ffda RBX: 004002c8 RCX: 00441119
RDX: 0040 RSI: 25c0 RDI: 0003
RBP: 006cc018 R08: 0100 R09: 0100
R10: 0100 R11: 0207 R12: 00402080
R13: 00402110 R14:  R15: 

Uninit was stored to memory at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline]
 kmsan_save_stack mm/kmsan/kmsan.c:261 [inline]
 kmsan_internal_chain_origin+0x13d/0x240 mm/kmsan/kmsan.c:469
 kmsan_memcpy_memmove_metadata+0x1a9/0xf70 mm/kmsan/kmsan.c:344
 kmsan_memcpy_metadata+0xb/0x10 mm/kmsan/kmsan.c:362
 __msan_memcpy+0x61/0x70 mm/kmsan/kmsan_instr.c:162
 __nla_put lib/nlattr.c:744 [inline]
 nla_put+0x20a/0x2d0 lib/nlattr.c:802
 nlmsg_populate_fdb_fill+0x444/0x810 net/core/rtnetlink.c:3466
 nlmsg_populate_fdb net/core/rtnetlink.c:3775 [inline]
 ndo_dflt_fdb_dump+0x73a/0x960 net/core/rtnetlink.c:3807
 rtnl_fdb_dump+0x1318/0x1cb0 net/core/rtnetlink.c:3979
 netlink_dump+0xc79/0x1c90 net/netlink/af_netlink.c:2244
 __netlink_dump_start+0x10c4/0x11d0 net/netlink/af_netlink.c:2352
 netlink_dump_start include/linux/netlink.h:216 [inline]
 rtnetlink_rcv_msg+0x141b/0x1540 net/core/rtnetlink.c:4910
 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2477
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4965
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1699/0x1740 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x13c7/0x1440 net/netlink/af_netlink.c:1917
 sock_sendmsg_nosec net/socket.c:621 [inline]
 sock_sendmsg net/socket.c:631 [inline]
 ___sys_sendmsg+0xe3b/0x1240 net/socket.c:2116
 __sys_sendmsg net/socket.c:2154 [inline]
 __do_sys_sendmsg net/socket.c:2163 [inline]
 __se_sys_sendmsg+0x305/0x460 net/socket.c:2161
 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161
 do_syscall_64+0xcf/0x110 arch/x86/entry/common.c:291
 entry_SYSCALL_64_after_hwframe+0x63/0xe7

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:246 [inline]
 kmsan_internal_poison_shadow+0x6d/0x130 mm/kmsan/kmsan.c:170
 kmsan_kmalloc+0xa1/0x100 mm/kmsan/kmsan_hooks.c:186
 __kmalloc+0x14c/0x4d0 mm/slub.c:3825
 kmalloc include/linux/slab.h:551 [inline]
 __hw_addr_create_ex net/core/dev_addr_lists.c:34 [inline]
 __hw_addr_add_ex net/core/dev_addr_lists.c:80 [inline]
 __dev_mc_add+0x357/0x8a0 net/core/dev_addr_lists.c:670
 dev_mc_add+0x6d/0x80 net/core/dev_addr_lists.c:687
 ip_mc_filter_add net/ipv4/igmp.c:1128 [inline]
 igmp_group_added+0x4d4/0xb80 net/ipv4/igmp.c:1311
 __ip_mc_inc_group+0xea9/0xf70 net/ipv4/igmp.c:1444
 ip_mc_inc_group net/ipv4/igmp.c:1453 [inline]
 ip_mc_up+0x1c3/0x400 net/ipv4/igmp.c:1775
 inetdev_event+0x1d03/0x1d80 net/ipv4/devinet.c:1522
 notifier_call_chain kernel/notifier.c:93 [inline]
 __raw_notifier_call_chain kernel/notifier.c:394 [inline]
 raw_notifier_call_chain+0x13d/0x240 kernel/notifier.c:401
 __dev_notify_flags+0x3da/0x860 net/core/dev.c:1733
 dev_change_flags+0x1ac/0x230 net/core/dev.c:7569
 do_setlink+0x165f/0x5ea0 net/core/rtnetlink.c:2492
 

RE: [PATCH] i40e: fix mac filter delete when setting mac address

2018-12-04 Thread Keller, Jacob E
> -Original Message-
> From: Stefan Assmann [mailto:sassm...@kpanic.de]
> Sent: Tuesday, December 04, 2018 6:19 AM
> To: intel-wired-...@lists.osuosl.org
> Cc: netdev@vger.kernel.org; da...@davemloft.net; Kirsher, Jeffrey T
> ; Keller, Jacob E ;
> sassm...@kpanic.de
> Subject: [PATCH] i40e: fix mac filter delete when setting mac address
> 
> A previous commit moved the ether_addr_copy() in i40e_set_mac() before
> the mac filter del/add to avoid a race. However it wasn't taken into
> account that this alters the mac address being handed to
> i40e_del_mac_filter().

Woops!

> 
> Also changed i40e_add_mac_filter() to operate on netdev->dev_addr,
> hopefully that makes the code easier to read.
> 

Makes sense.

> Fixes: 458867b2ca0c ("i40e: don't remove netdev->dev_addr when syncing uc 
> list")
> 

Good catch. This looks correct to me.

Acked-by: Jacob Keller 

> Signed-off-by: Stefan Assmann 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 6d5b13f69dec..901ef799d2ad 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -1546,17 +1546,17 @@ static int i40e_set_mac(struct net_device *netdev, 
> void
> *p)
>   netdev_info(netdev, "set new mac address %pM\n", addr->sa_data);
> 
>   /* Copy the address first, so that we avoid a possible race with
> -  * .set_rx_mode(). If we copy after changing the address in the filter
> -  * list, we might open ourselves to a narrow race window where
> -  * .set_rx_mode could delete our dev_addr filter and prevent traffic
> -  * from passing.
> +  * .set_rx_mode().
> +  * - Remove old address from MAC filter
> +  * - Copy new address
> +  * - Add new address to MAC filter
>*/
> - ether_addr_copy(netdev->dev_addr, addr->sa_data);
> -
>   spin_lock_bh(&vsi->mac_filter_hash_lock);
>   i40e_del_mac_filter(vsi, netdev->dev_addr);
> - i40e_add_mac_filter(vsi, addr->sa_data);
> + ether_addr_copy(netdev->dev_addr, addr->sa_data);
> + i40e_add_mac_filter(vsi, netdev->dev_addr);
> + spin_unlock_bh(&vsi->mac_filter_hash_lock);
> +
>   if (vsi->type == I40E_VSI_MAIN) {
>   i40e_status ret;
> 
> --
> 2.19.2



[PATCH net-next] tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT

2018-12-04 Thread Eric Dumazet
TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12
as a step to enable bigger tcp sndbuf limits.

It works reasonably well, but the following happens :

Once the limit is reached, TCP stack generates
an [E]POLLOUT event for every incoming ACK packet.

This causes a high number of context switches.

This patch implements the strategy David Miller added
in sock_def_write_space() :

 - If TCP socket has a notsent_lowat constraint of X bytes,
   allow sendmsg() to fill up to X bytes, but send [E]POLLOUT
   only if number of notsent bytes is below X/2

This considerably reduces TCP_NOTSENT_LOWAT overhead,
while allowing to keep the pipe full.

Tested:
 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM

A:/# cat /proc/sys/net/ipv4/tcp_wmem
4096	262144	64000000
A:/# super_netperf 100 -H B -l 1000 -- -K bbr &

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/

A:/# vmstat 5 5
procs -----------memory---------- ---swap-- -----io---- --system-- ------cpu-----
 r  b   swpd      free   buff   cache   si   so   bi   bo     in     cs us sy id wa st
 0  0      0 256220672  13532  694976    0    0   10    0     28     14  0  1 99  0  0
 2  0      0 256320016  13532  698480    0    0  512    0 715901   5927  0 10 90  0  0
 0  0      0 256197232  13532  700992    0    0  735   13 771161   5849  0 11 89  0  0
 1  0      0 256233824  13532  703320    0    0  512   23 719650   6635  0 11 89  0  0
 2  0      0 256226880  13532  705780    0    0  642    4 775650   6009  0 12 88  0  0

A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow

A:/# vmstat 5 5  # check that context switches have not inflated too much.
procs -----------memory---------- ---swap-- -----io---- --system-- ------cpu-----
 r  b   swpd      free   buff   cache   si   so   bi   bo     in     cs us sy id wa st
 2  0      0 260386512  13592  662148    0    0   10    0     17     14  0  1 99  0  0
 0  0      0 260519680  13592  604184    0    0  512   13 726843  12424  0 10 90  0  0
 1  1      0 260435424  13592  598360    0    0  512   25 764645  12925  0 10 90  0  0
 1  0      0 260855392  13592  578380    0    0  512    7 722943  13624  0 11 88  0  0
 1  0      0 260445008  13592  601176    0    0  614   34 772288  14317  0 10 90  0  0

Signed-off-by: Eric Dumazet 
Acked-by: Soheil Hassas Yeganeh 
---
 include/net/sock.h | 20 +++-
 include/net/tcp.h  |  8 ++--
 net/core/stream.c  |  2 +-
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 
f665d74ae509b41d3ffe10faad4065b61f369341..df390a3e23fedca4c75e3a7e66ad9e35aac71746
 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1110,7 +1110,7 @@ struct proto {
unsigned int	inuse_idx;
 #endif
 
-   bool(*stream_memory_free)(const struct sock *sk);
+   bool(*stream_memory_free)(const struct sock *sk, 
int wake);
bool(*stream_memory_read)(const struct sock *sk);
/* Memory pressure */
void(*enter_memory_pressure)(struct sock *sk);
@@ -1192,19 +1192,29 @@ static inline void sk_refcnt_debug_release(const struct 
sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
-static inline bool sk_stream_memory_free(const struct sock *sk)
+static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
if (sk->sk_wmem_queued >= sk->sk_sndbuf)
return false;
 
return sk->sk_prot->stream_memory_free ?
-   sk->sk_prot->stream_memory_free(sk) : true;
+   sk->sk_prot->stream_memory_free(sk, wake) : true;
 }
 
-static inline bool sk_stream_is_writeable(const struct sock *sk)
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+   return __sk_stream_memory_free(sk, 0);
+}
+
+static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
 {
return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
-  sk_stream_memory_free(sk);
+  __sk_stream_memory_free(sk, wake);
+}
+
+static inline bool sk_stream_is_writeable(const struct sock *sk)
+{
+   return __sk_stream_is_writeable(sk, 0);
 }
 
 static inline int sk_under_cgroup_hierarchy(struct sock *sk,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 
0681afc62354c6693e316ae66cb7e72a1cb8bd0a..e0a65c067662346f26a14754ac73de0600d797a8
 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1870,12 +1870,16 @@ static inline u32 tcp_notsent_lowat(const struct 
tcp_sock *tp)
return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
 }
 
-static inline bool tcp_stream_memory_free(const struct sock *sk)
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends 
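
A hedged sketch of the half-limit test the changelog describes (field names
are assumed from the surrounding hunks):

static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = tp->write_seq - tp->snd_nxt;

	/* @wake == 1 when called from sk_stream_write_space(): doubling
	 * notsent_bytes means [E]POLLOUT fires only once the not-sent
	 * queue drains below half of notsent_lowat
	 */
	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}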

Re: [RFC bpf-next 1/7] bpf: interpreter support BPF_ALU | BPF_ARSH

2018-12-04 Thread David Miller
From: Jiong Wang 
Date: Tue,  4 Dec 2018 04:56:29 -0500

> This patch implements interpreting BPF_ALU | BPF_ARSH. Do arithmetic right
> shift on low 32-bit sub-register, and zero the high 32 bits.
> 
> Reviewed-by: Jakub Kicinski 
> Signed-off-by: Jiong Wang 

I just want to say that this behavior is interesting because on most
cpus that have a 32-bit and 64-bit variant, the 32-bit arithmetic
right shift typically sign extends to 64-bit rather than zero extends
which is what is being defined here.

Well, definitely, sparc64 behaves this way.
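
A small illustration of the difference (standalone pseudo-C sketch, not
kernel code):

/* BPF_ALU (32-bit) BPF_ARSH as defined by this patch: arithmetic shift
 * on the low 32 bits, then zero the high 32 bits.
 */
u64 bpf_arsh32(u64 dst, u32 shift)
{
	return (u64)(u32)((s32)(u32)dst >> shift);
}

/* What a cpu like sparc64 does for its 32-bit variant: the 32-bit result
 * is sign-extended into the full 64-bit register.
 */
u64 cpu_arsh32(u64 dst, u32 shift)
{
	return (u64)(s64)((s32)(u32)dst >> shift);
}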


RE: [Intel-wired-lan] [PATCH] i40e: fix mac filter delete when setting mac address

2018-12-04 Thread Bowers, AndrewX
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Stefan Assmann
> Sent: Tuesday, December 4, 2018 6:19 AM
> To: intel-wired-...@lists.osuosl.org
> Cc: netdev@vger.kernel.org; da...@davemloft.net; sassm...@kpanic.de
> Subject: [Intel-wired-lan] [PATCH] i40e: fix mac filter delete when setting
> mac address
> 
> A previous commit moved the ether_addr_copy() in i40e_set_mac() before
> the mac filter del/add to avoid a race. However it wasn't taken into account
> that this alters the mac address being handed to i40e_del_mac_filter().
> 
> Also changed i40e_add_mac_filter() to operate on netdev->dev_addr,
> hopefully that makes the code easier to read.
> 
> Fixes: 458867b2ca0c ("i40e: don't remove netdev->dev_addr when syncing
> uc list")
> 
> Signed-off-by: Stefan Assmann 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)

Tested-by: Andrew Bowers 




[PATCH net-next v2 1/3] net: bridge: convert multicast to generic rhashtable

2018-12-04 Thread Nikolay Aleksandrov
The bridge multicast code currently uses a custom resizable hashtable
which predates the generic rhashtable interface. It has many
shortcomings compared to it and duplicates functionality that is presently
available via the generic rhashtable, so this patch removes the custom
rhashtable implementation in favor of the kernel's generic rhashtable.
The hash maximum is kept and the rhashtable's size is used to do a loose
check if it's reached in which case we revert to the old behaviour and
disable further bridge multicast processing. Also now we can support any
hash maximum; it doesn't need to be a power of 2.
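
For context, a sketch of the kind of rhashtable setup such a conversion uses
(field names here are assumptions for illustration, not copied from the diff
below):

static const struct rhashtable_params br_mdb_rht_params = {
	.head_offset		= offsetof(struct net_bridge_mdb_entry, rhnode),
	.key_offset		= offsetof(struct net_bridge_mdb_entry, addr),
	.key_len		= sizeof(struct br_ip),
	.automatic_shrinking	= true,	/* why a liberal hash_max is fine */
};

Lookups then become rhashtable_lookup() calls against these params instead of
walking a hand-rolled bucket array.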

v2: handle when IGMP snooping is undefined, add br_mdb_init/uninit
placeholders

Signed-off-by: Nikolay Aleksandrov 
---
 net/bridge/br_device.c|  10 +
 net/bridge/br_mdb.c   | 120 +---
 net/bridge/br_multicast.c | 403 ++
 net/bridge/br_private.h   |  38 ++--
 4 files changed, 146 insertions(+), 425 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c6abf927f0c9..9f41a5d4da3f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev)
return err;
}
 
+   err = br_mdb_hash_init(br);
+   if (err) {
+   free_percpu(br->stats);
+   br_fdb_hash_fini(br);
+   return err;
+   }
+
err = br_vlan_init(br);
if (err) {
free_percpu(br->stats);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
return err;
}
@@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev)
if (err) {
free_percpu(br->stats);
br_vlan_flush(br);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
}
br_set_lockdep_class(dev);
@@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev)
br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
+   br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
free_percpu(br->stats);
 }
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a7ea2d431714..ea8abdb56df3 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry 
*entry, struct br_ip *ip)
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
 {
+   int idx = 0, s_idx = cb->args[1], err = 0;
struct net_bridge *br = netdev_priv(dev);
-   struct net_bridge_mdb_htable *mdb;
+   struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
-   int i, err = 0;
-   int idx = 0, s_idx = cb->args[1];
 
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
 
-   mdb = rcu_dereference(br->mdb);
-   if (!mdb)
-   return 0;
-
nest = nla_nest_start(skb, MDBA_MDB);
if (nest == NULL)
return -EMSGSIZE;
 
-   for (i = 0; i < mdb->max; i++) {
-   struct net_bridge_mdb_entry *mp;
+   hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
struct net_bridge_port *port;
 
-   hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
-   if (idx < s_idx)
-   goto skip;
+   if (idx < s_idx)
+   goto skip;
 
-   nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
-   if (nest2 == NULL) {
-   err = -EMSGSIZE;
-   goto out;
-   }
+   nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+   if (!nest2) {
+   err = -EMSGSIZE;
+   break;
+   }
 
-   for (pp = &mp->ports;
-(p = rcu_dereference(*pp)) != NULL;
- pp = &p->next) {
-   struct nlattr *nest_ent;
-   struct br_mdb_entry e;
-
-   port = p->port;
-   if (!port)
-   continue;
-
-   memset(&e, 0, sizeof(e));
-   e.ifindex = port->dev->ifindex;
-   e.vid = p->addr.vid;
-   __mdb_entry_fill_flags(, p->flags);
-   if (p->addr.proto == htons(ETH_P_IP))
-   e.addr.u.ip4 = p->addr.u.ip4;
+   for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+   struct nlattr 

[PATCH net-next v2 3/3] net: bridge: increase multicast's default maximum number of entries

2018-12-04 Thread Nikolay Aleksandrov
bridge's default hash_max was 512, which is rather conservative. Now that
we're using the generic rhashtable API, which autoshrinks, let's increase
it to 4096 and move it to a define in br_private.h.

Signed-off-by: Nikolay Aleksandrov 
---
v2: no change

 net/bridge/br_multicast.c | 2 +-
 net/bridge/br_private.h   | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d71edc9fa5c3..031067283678 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1743,7 +1743,7 @@ static void br_ip6_multicast_query_expired(struct 
timer_list *t)
 
 void br_multicast_init(struct net_bridge *br)
 {
-   br->hash_max = 512;
+   br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
 
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
br->multicast_last_member_count = 2;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index df7584068d25..20a703a679e1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -31,6 +31,8 @@
 #define BR_PORT_BITS   10
#define BR_MAX_PORTS   (1<<BR_PORT_BITS)

Re: [PATCH net] rtnetlink: Refine sanity checks in rtnl_fdb_{add|del}

2018-12-04 Thread Eric Dumazet



On 12/03/2018 03:52 PM, David Miller wrote:
> From: Ido Schimmel 
> Date: Fri, 30 Nov 2018 19:00:24 +0200
> 
>> Yes, agree. Patch is good. I'll tag your v2.
> 
> This means, I assume, that a new version of this fix is coming.
> 
> Eric, is this correct?
> 

This is absolutely correct David, I will send a v2 today.


RE: [PATCH net 1/2] net/mlx4_en: Change min MTU size to ETH_MIN_MTU

2018-12-04 Thread David Laight
From: Tariq Toukan
> Sent: 02 December 2018 12:35
> From: Eran Ben Elisha 
> 
> NIC driver minimal MTU size shall be set to ETH_MIN_MTU, as defined in
> the RFC791 and in the network stack. Remove old mlx4_en only define for
> it, which was set to the wrong value.
...
> 
> - /* MTU range: 46 - hw-specific max */
> - dev->min_mtu = MLX4_EN_MIN_MTU;
> + /* MTU range: 68 - hw-specific max */
> + dev->min_mtu = ETH_MIN_MTU;
>   dev->max_mtu = priv->max_mtu;

Where does 68 come from?
The minimum size of an ethernet packet including the mac addresses
and CRC is 64 bytes - but that would never be an 'mtu'.

Since 64 - 46 = 18, the 46 probably excludes both MAC addresses,
the ethertype/length and the CRC.
This is 'sort of' the minimum mtu for an ethernet frame.

I'm not sure which values are supposed to be in dev->min/max_mtu.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)



RE: [PATCH net-next v4 1/2] ixgbe: register a mdiobus

2018-12-04 Thread Bowers, AndrewX
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Steve Douthit
> Sent: Monday, December 3, 2018 12:15 PM
> To: Kirsher, Jeffrey T 
> Cc: Andrew Lunn ; Florian Fainelli ;
> netdev@vger.kernel.org; intel-wired-...@lists.osuosl.org; David S. Miller
> 
> Subject: [Intel-wired-lan] [PATCH net-next v4 1/2] ixgbe: register a mdiobus
> 
> Most dsa devices expect a 'struct mii_bus' pointer to talk to switches via the
> MII interface.
> 
> While this works for dsa devices, it will not work safely with Linux PHYs in 
> all
> configurations since the firmware of the ixgbe device may be polling some
> PHY addresses in the background.
> 
> Signed-off-by: Stephen Douthit 
> Reviewed-by: Andrew Lunn 
> Reviewed-by: Florian Fainelli 
> ---
>  drivers/net/ethernet/intel/Kconfig|   1 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h  |   2 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   5 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c  | 299 ++
>  drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h  |   2 +
>  5 files changed, 309 insertions(+)

Tested-by: Andrew Bowers 




RE: [PATCH net-next v4 2/2] ixgbe: use mii_bus to handle MII related ioctls

2018-12-04 Thread Bowers, AndrewX
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Steve Douthit
> Sent: Monday, December 3, 2018 12:15 PM
> To: Kirsher, Jeffrey T 
> Cc: Andrew Lunn ; Florian Fainelli ;
> netdev@vger.kernel.org; intel-wired-...@lists.osuosl.org; David S. Miller
> 
> Subject: [Intel-wired-lan] [PATCH net-next v4 2/2] ixgbe: use mii_bus to
> handle MII related ioctls
> 
> Use the mii_bus callbacks to address the entire clause 22/45 address space.
> Enables userspace to poke switch registers instead of a single PHY address.
> 
> The ixgbe firmware may be polling PHYs in a way that is not protected by the
> mii_bus lock.  This isn't new behavior, but as Andrew Lunn pointed out there
> are more addresses available for conflicts.
> 
> Signed-off-by: Stephen Douthit 
> Reviewed-by: Andrew Lunn 
> Reviewed-by: Florian Fainelli 
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 18 ++
>  1 file changed, 18 insertions(+)

Tested-by: Andrew Bowers 




[PATCH net-next v2 2/3] net: bridge: mark hash_elasticity as obsolete

2018-12-04 Thread Nikolay Aleksandrov
Now that the bridge multicast uses the generic rhashtable interface we
can drop the hash_elasticity option as that is already done for us and
it's hardcoded to a maximum of RHT_ELASTICITY (16 currently). Add a
warning about the obsolete option when the hash_elasticity is set.

Signed-off-by: Nikolay Aleksandrov 
---
v2: no change

 net/bridge/br_multicast.c |  1 -
 net/bridge/br_netlink.c   | 11 ---
 net/bridge/br_private.h   |  1 -
 net/bridge/br_sysfs_br.c  |  6 +++---
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index a3ac47dd0e5f..d71edc9fa5c3 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1743,7 +1743,6 @@ static void br_ip6_multicast_query_expired(struct 
timer_list *t)
 
 void br_multicast_init(struct net_bridge *br)
 {
-   br->hash_elasticity = 4;
br->hash_max = 512;
 
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 13cd50326af2..d3a5963de35d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1189,11 +1189,9 @@ static int br_changelink(struct net_device *brdev, 
struct nlattr *tb[],
return err;
}
 
-   if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) {
-   u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]);
-
-   br->hash_elasticity = val;
-   }
+   if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
+   br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+   RHT_ELASTICITY);
 
if (data[IFLA_BR_MCAST_HASH_MAX]) {
u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
@@ -1457,8 +1455,7 @@ static int br_fill_info(struct sk_buff *skb, const struct 
net_device *brdev)
   br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
   br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
-   nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
-   br->hash_elasticity) ||
+   nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a0ce0822921c..df7584068d25 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -372,7 +372,6 @@ struct net_bridge {
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 
-   u32 hash_elasticity;
u32 hash_max;
 
u32 multicast_last_member_count;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 6a378a7e16ea..05a910d3b70a 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -424,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier);
 static ssize_t hash_elasticity_show(struct device *d,
struct device_attribute *attr, char *buf)
 {
-   struct net_bridge *br = to_bridge(d);
-   return sprintf(buf, "%u\n", br->hash_elasticity);
+   return sprintf(buf, "%u\n", RHT_ELASTICITY);
 }
 
 static int set_elasticity(struct net_bridge *br, unsigned long val)
 {
-   br->hash_elasticity = val;
+   br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+   RHT_ELASTICITY);
return 0;
 }
 
-- 
2.17.2



[PATCH net-next v2 0/3] net: bridge: convert multicast to generic rhashtable

2018-12-04 Thread Nikolay Aleksandrov
Hi,
The current bridge multicast code uses a custom rhashtable
implementation which predates the generic rhashtable API. Patch 01
converts it to use the generic kernel rhashtable which simplifies the
code a lot and removes duplicated functionality. The conversion also makes
hash_elasticity obsolete as the generic rhashtable already has such
checks and has a fixed elasticity of RHT_ELASTICITY (16 currently) so we
emit a warning whenever elasticity is set and return RHT_ELASTICITY when
read (patch 02). Since now we have the generic rhashtable which
autoshrinks we can be more liberal with the default hash maximum so patch
03 increases it to 4096 and moves it to a define in br_private.h.

v2: send the latest version of the set which handles when IGMP snooping
is not defined, changes are in patch 01

Thanks,
 Nik


Nikolay Aleksandrov (3):
  net: bridge: convert multicast to generic rhashtable
  net: bridge: mark hash_elasticity as obsolete
  net: bridge: increase multicast's default maximum number of entries

 net/bridge/br_device.c|  10 +
 net/bridge/br_mdb.c   | 120 +--
 net/bridge/br_multicast.c | 406 +++---
 net/bridge/br_netlink.c   |  11 +-
 net/bridge/br_private.h   |  41 ++--
 net/bridge/br_sysfs_br.c  |   6 +-
 6 files changed, 156 insertions(+), 438 deletions(-)

-- 
2.17.2



Re: [PATCH bpf 0/3] bpf: improve verifier resilience

2018-12-04 Thread Daniel Borkmann
On 12/04/2018 07:46 AM, Alexei Starovoitov wrote:
> Three patches to improve the verifier's ability to handle pathological bpf programs
> with a lot of branches:
> - make sure prog_load syscall can be aborted
> - improve branch taken analysis
> - introduce per-insn complexity limit for unprivileged programs
> 
> Alexei Starovoitov (3):
>   bpf: check pending signals while verifying programs
>   bpf: improve verifier branch analysis
>   bpf: add per-insn complexity limit
> 
>  kernel/bpf/verifier.c   | 103 +---
>  tools/testing/selftests/bpf/test_verifier.c |   4 +-
>  2 files changed, 91 insertions(+), 16 deletions(-)
> 

Applied to bpf, thanks!


Re: [PATCH net-next 1/4] indirect call wrappers: helpers to speed-up indirect calls of builtin

2018-12-04 Thread David Miller
From: Paolo Abeni 
Date: Tue, 04 Dec 2018 12:27:51 +0100

> On Mon, 2018-12-03 at 10:04 -0800, Eric Dumazet wrote:
>> On 12/03/2018 03:40 AM, Paolo Abeni wrote:
>> > This header defines a bunch of helpers that allow avoiding the
>> > retpoline overhead when calling builtin functions via function pointers.
>> > It boils down to explicitly comparing the function pointers to
>> > known builtin functions and eventually invoke directly the latter.
>> > 
>> > The macros defined here implement the boilerplate for the above schema
>> > and will be used by the next patches.
>> > 
>> > rfc -> v1:
>> >  - use branch prediction hint, as suggested by Eric
>> > 
>> > Suggested-by: Eric Dumazet 
>> > Signed-off-by: Paolo Abeni 
>> > ---
>> >  include/linux/indirect_call_wrapper.h | 77 +++
>> >  1 file changed, 77 insertions(+)
>> >  create mode 100644 include/linux/indirect_call_wrapper.h
>> 
>> This needs to be discussed more broadly, please include lkml 
> 
> Agreed. @David: please let me know if you prefer a repost or a v2 with
> the expanded recipients list.

v2 probably works better and will help me better keep track of things.

Thanks for asking.


Re: [PATCH net-next] net: netem: use a list in addition to rbtree

2018-12-04 Thread Eric Dumazet



On 12/03/2018 05:07 PM, Peter Oskolkov wrote:
> When testing high-bandwidth TCP streams with large windows,
> high latency, and low jitter, netem consumes a lot of CPU cycles
> doing rbtree rebalancing.
> 
> This patch uses a linear list/queue in addition to the rbtree:
> if an incoming packet is past the tail of the linear queue, it is
> added there, otherwise it is inserted into the rbtree.
> 
> Without this patch, perf shows netem_enqueue, netem_dequeue,
> and rb_* functions among the top offenders. With this patch,
> only netem_enqueue is noticeable if jitter is low/absent.
> 
> Suggested-by: Eric Dumazet 
> Signed-off-by: Peter Oskolkov 
> ---

Reviewed-by: Eric Dumazet 

Thanks Peter !
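
For reference, a sketch of the enqueue fast path the patch describes (t_head,
t_tail and t_root match the field names quoted elsewhere in this thread; the
rbtree fallback is a hypothetical helper):

static void tfifo_enqueue_sketch(struct netem_sched_data *q, struct sk_buff *nskb)
{
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		/* in order w.r.t. the current tail: append to the list */
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		/* out of order: insert into the rbtree as before */
		tfifo_enqueue_rbtree(q, nskb);	/* hypothetical helper */
	}
}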



[PATCH iproute2 2/2] testsuite: Add a test for batch processing

2018-12-04 Thread Petr Machata
Test that when a second or following command in a batch fails, tc
reports it correctly. This is a test for the previous patch.

Signed-off-by: Petr Machata 
---
 testsuite/tests/tc/batch.t | 23 +++
 1 file changed, 23 insertions(+)
 create mode 100755 testsuite/tests/tc/batch.t

diff --git a/testsuite/tests/tc/batch.t b/testsuite/tests/tc/batch.t
new file mode 100755
index 000..50e7ba3
--- /dev/null
+++ b/testsuite/tests/tc/batch.t
@@ -0,0 +1,23 @@
+#!/bin/sh
+. lib/generic.sh
+
+DEV="$(rand_dev)"
+ts_ip "$0" "Add $DEV dummy interface" link add dev $DEV type dummy
+ts_ip "$0" "Enable $DEV" link set $DEV up
+ts_tc "$0" "Add ingress qdisc" qdisc add dev $DEV clsact
+
+TMP="$(mktemp)"
+echo filt add dev $DEV ingress pref 1000 matchall action pass >> "$TMP"
+echo filt add dev $DEV ingress pref 1000 matchall action pass >> "$TMP"
+
+"$TC" -b "$TMP" 2> $STD_ERR > $STD_OUT
+if [ $? -eq 0 ]; then
+   ts_err "$0: batch passed when it should have failed"
+elif [ ! -s $STD_ERR ]; then
+   ts_err "$0: batch produced no error message"
+else
+   echo "$0: batch failed, as expected"
+fi
+
+rm "$TMP"
+ts_ip "$0" "Del $DEV dummy interface" link del dev $DEV
-- 
2.4.11



[PATCH iproute2 1/2] libnetlink: Process further iovs on no error

2018-12-04 Thread Petr Machata
When no error is reported in the first iov, do not prematurely return,
but process further iovs. This fixes batch processing.

Fixes: c60389e4f9ea ("libnetlink: fix leak and using unused memory on error")
Signed-off-by: Petr Machata 
---
 lib/libnetlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index c0b80ed..9545710 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -763,6 +763,7 @@ static int __rtnl_talk_iov(struct rtnl_handle *rtnl, struct 
iovec *iov,
msg.msg_iovlen = 1;
i = 0;
while (1) {
+next:
status = rtnl_recvmsg(rtnl->fd, &msg, &buf);
++i;
 
@@ -826,6 +827,8 @@ static int __rtnl_talk_iov(struct rtnl_handle *rtnl, struct 
iovec *iov,
else
free(buf);
 
+   if (i < iovlen)
+   goto next;
return error ? -i : 0;
}
 
-- 
2.4.11



Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Jakub Audykowicz
On 2018-11-28 12:26, Marcelo Ricardo Leitner wrote:

> On Wed, Nov 28, 2018 at 12:08:38AM -0200, Marcelo Ricardo Leitner wrote:
>> On Tue, Nov 27, 2018 at 11:18:02PM +0100, Jakub Audykowicz wrote:
>>> On 2018-11-19 08:20, Xin Long wrote:
>>>
 On Mon, Nov 19, 2018 at 5:49 AM Jakub Audykowicz
  wrote:
> Calling send on a connected SCTP socket results in kernel panic if
> spp_pathmtu was configured manually before an association is established
> and it was not reconfigured to another value once the association is
> established.
>
> Steps to reproduce:
> 1. Set up a listening SCTP server socket.
> 2. Set up an SCTP client socket.
> 3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
> spp_pathmtu set to a legal value (e.g. 1000) and
> SPP_PMTUD_DISABLE set in spp_flags.
> 4. Connect client to server.
> 5. Send message from client to server.
>
> At this point oom-killer is invoked, which will eventually lead to:
> [5.197262] Out of memory and no killable processes...
> [5.198107] Kernel panic - not syncing: System is deadlocked on memory
>
> Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> introduces sctp_assoc_update_frag_point, but this function is not called
> in this case, causing frag_point to be zero:
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu)
> +   if (asoc->pathmtu != pmtu) {
> asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
> +   }
>
> In this scenario, on association establishment, asoc->pathmtu is already
> 1000 and pmtu will be as well. Before this commit the frag_point was being
> correctly set in the scenario described. Moving the call outside the if
> block fixes the issue.
>
> I will be providing a separate patch to lksctp-tools with a simple test
> reproducing this problem ("func_tests: frag_point should never be zero").
>
> I have also taken the liberty to introduce a sanity check in chunk.c to
> set the frag_point to a non-negative value in order to avoid chunking
> endlessly (which is the reason for the eventual panic).
>
> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> Signed-off-by: Jakub Audykowicz 
> ---
>  include/net/sctp/constants.h |  3 +++
>  net/sctp/associola.c | 13 +++--
>  net/sctp/chunk.c |  6 ++
>  3 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 8dadc74c22e7..90316fab6f04 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
>  */
>  #define SCTP_DEFAULT_MINSEGMENT 512	/* MTU size ... if no mtu disc */
>
> +/* An association's fragmentation point should never be non-positive */
> +#define SCTP_FRAG_POINT_MIN 1
> +
>  #define SCTP_SECRET_SIZE 32/* Number of octets in a 256 
> bits. */
>
>  #define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 6a28b96e779e..44d71a1af62e 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct 
> sctp_association *asoc)
>
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu) {
> -   asoc->pathmtu = pmtu;
> -   sctp_assoc_update_frag_point(asoc);
> -   }
> +   pr_debug("%s: before asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
> +
> +   asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
>
> -   pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
> -asoc->pathmtu, asoc->frag_point);
> +   pr_debug("%s: after asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
>  }
 The idea was whenever asoc->pathmtu changes,  frag_point should
 be updated, but we missed one place in sctp_association_init().

 Another issue is after the 4-way handshake, the client's asoc->intl_enable
 may be changed from 0 to 1, which means the frag_point should
 also be updated, since [1]:

 void sctp_assoc_update_frag_point(struct sctp_association *asoc)
 {
 int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
 sctp_datachk_len(&asoc->stream)); <--- 
 [1]

 So one fix for 

Re: [PATCH net-next v4 0/2] Add mii_bus to ixgbe driver for dsa devs

2018-12-04 Thread Steve Douthit
On 12/3/18 6:46 PM, Florian Fainelli wrote:
> Yes, we have been discussing that topic with Andrew and have a few ideas
> on how that could be achieved, but no code to use that at the moment.
> One of the ideas was to finally allow enslaving the DSA master network
> device; that way you could assign specific ports to a specific CPU/DSA
> master network port within the switch, though that requires setting up a
> bridge. Would that work for your use case?

If bridge here means a port-based VLAN, then yes, that works.  If we're
talking about something that shows up under brctl, then that's less
ideal.

It's probably time to split the thread for any further discussion since
this isn't really relevant to intel-wired-lan anymore.


Re: [PATCH net-next] net: netem: use a list in addition to rbtree

2018-12-04 Thread Peter Oskolkov
Thanks, Stephen!

I don't care much about braces either. David, do you want me to send a
new patch with braces moved around?

On Tue, Dec 4, 2018 at 9:56 AM Stephen Hemminger
 wrote:
>
> I like this, it makes a lot of sense since packets are almost
> always queued in order.
>
> Minor style stuff you might want to fix (but don't have to).
>
> > +	if (!last ||
> > +	    t_last->time_to_send > last->time_to_send) {
> > +		last = t_last;
> > +	}
>
> I don't think you need braces here for single assignment.
>
> > +static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
> > +{
> > +	if (skb == q->t_head) {
> > +		q->t_head = skb->next;
> > +		if (!q->t_head)
> > +			q->t_tail = NULL;
> > +	} else
> > +		rb_erase(&skb->rbnode, &q->t_root);
>
> Checkpatch wants both sides of if/else to have brackets.
> Personally, don't care.
>
> Reviewed-by: Stephen Hemminger 


Re: [PATCH rdma-next 0/7] Enrich DEVX support

2018-12-04 Thread Jason Gunthorpe
On Tue, Dec 4, 2018 at 12:02 PM Doug Ledford  wrote:
>
> On Mon, 2018-11-26 at 08:28 +0200, Leon Romanovsky wrote:
> > From: Leon Romanovsky 
> >
> > From Yishai,
> > ---
> > This series enriches DEVX support in a few aspects: it enables
> > interoperability between DEVX and verbs and improves the mechanism for
> > controlling privileged DEVX commands.
> >
> > The first patch updates mlx5 ifc file.
> >
> > Next 3 patches enable modifying and querying verbs objects via the DEVX
> > interface.
> >
> > To achieve that the core layer introduced the 'UVERBS_IDR_ANY_OBJECT' type
> > to match any IDR object. Once it's used by some driver's method, the
> > infrastructure skips checking for the IDR type and it becomes the driver
> > handler responsibility.
> >
> > The DEVX methods of modify and query were changed to get any object type via
> > the 'UVERBS_IDR_ANY_OBJECT' mechanism. The type checking is done per
> > object as part of the driver code.
> >
> > The next 3 patches introduce a more robust mechanism for controlling
> > privileged DEVX commands. The responsibility to block/allow per command
> > was moved to the firmware, based on the UID credentials that the driver
> > reports upon user context creation. This enables more granularity per
> > command based on the device security model and the user credentials.
> >
> > In addition, by introducing a valid range for 'general commands' we
> > prevent the need to touch the driver's code any time a new command is
> > added in the future.
> >
> > The last patch fixes the XRC verbs flow once a DEVX context is used. This is
> > needed as XRCD is a shared kernel resource, and as such a kernel UID (=0)
> > should be used in its related resources.
> >
> > Thanks
> >
> > Yishai Hadas (7):
> >   net/mlx5: Update mlx5_ifc with DEVX UCTX capabilities bits
> >   IB/core: Introduce UVERBS_IDR_ANY_OBJECT
> >   IB/core: Enable getting an object type from a given uobject
> >   IB/mlx5: Enable modify and query verbs objects via DEVX
> >   IB/mlx5: Enforce DEVX privilege by firmware
> >   IB/mlx5: Update the supported DEVX commands
> >   IB/mlx5: Allow XRC usage via verbs in DEVX context
> >
> >  drivers/infiniband/core/rdma_core.c   |  27 +++--
> >  drivers/infiniband/core/rdma_core.h   |  21 ++--
> >  drivers/infiniband/core/uverbs_uapi.c |  10 +-
> >  drivers/infiniband/hw/mlx5/devx.c | 142 ++
> >  drivers/infiniband/hw/mlx5/main.c |   4 +-
> >  drivers/infiniband/hw/mlx5/mlx5_ib.h  |   6 +-
> >  drivers/infiniband/hw/mlx5/qp.c   |  12 +--
> >  drivers/infiniband/hw/mlx5/srq.c  |   2 +-
> >  include/linux/mlx5/mlx5_ifc.h |  26 -
> >  include/rdma/uverbs_ioctl.h   |   6 ++
> >  include/rdma/uverbs_std_types.h   |  12 +++
> >  11 files changed, 215 insertions(+), 53 deletions(-)
> >
> > --
> > 2.19.1
> >
>
> Hi Leon,
>
> I've put this into my wip/dl-for-next branch.  Because it pulled in the
> mlx5-next tree (which was significant because the patch we wanted was at
> the tip and a whole slew of other patches were before it),

Yes, this is how the shared branch works, DaveM merges it, we don't
have to unless we have a dependency, so it can accumulate a bit.

> the changeset
> for this patch looks much bigger.  And it had the merge conflict you
> mentioned.  You might want to double check my work before it gets moved
> over to the official for-next branch.

It is a bit easier to follow the git log if merges are done with the
--log option to summarize what was pulled in.

Jason


Re: [Patch net] mlx5: check for malformed packets

2018-12-04 Thread Saeed Mahameed
On Mon, Dec 3, 2018 at 10:39 AM Cong Wang  wrote:
>
> On Sat, Dec 1, 2018 at 12:38 PM Cong Wang  wrote:
> >
> > is_last_ethertype_ip() is used to check IP/IPv6 protocol before
> > parsing IP/IPv6 headers.
>
> One thing I noticed while reviewing the assembly code is that
> is_last_ethertype_ip() is no longer inlined after this patch.
>
> I think I should keep it inlined by using __always_inline, as it is on
> a hot path.
>
> What do you think, Tariq?

We are against having inline keywords in C files, so better if you
move this function to a header file and force the inline keyword there.


[PATCH net] sctp: frag_point sanity check

2018-12-04 Thread Jakub Audykowicz
If for some reason an association's fragmentation point is zero,
sctp_datamsg_from_user will endlessly try to divide a message into
zero-sized chunks. This eventually causes a kernel panic due to
running out of memory.

Although this situation is quite unlikely, it has occurred before as
reported. I propose to add this simple last-ditch sanity check due to
the severity of the potential consequences.

Signed-off-by: Jakub Audykowicz 
---
 include/net/sctp/sctp.h | 5 +
 net/sctp/chunk.c| 6 ++
 net/sctp/socket.c   | 3 +--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index ab9242e51d9e..2abbc15824af 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
return false;
 }
 
+static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
+{
+   return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
+}
+
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce8087846f05..d5b91bc8a377 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 * the packet
 */
max_data = asoc->frag_point;
+   if (unlikely(!max_data)) {
+   max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk),
+  sctp_datachk_len(&asoc->stream));
+   pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%d)",
+   __func__, asoc, max_data);
+   }
 
/* If the the peer requested that we authenticate DATA chunks
 * we need to account for bundling of the AUTH chunks along with
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf618d1b41fd..b8cebd5a87e5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
__u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
 sizeof(struct sctp_data_chunk);
 
-   min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
-  datasize);
+   min_len = sctp_min_frag_point(sp, datasize);
max_len = SCTP_MAX_CHUNK_LEN - datasize;
 
if (val < min_len || val > max_len)
-- 
2.17.1
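
To illustrate the failure mode in userspace (a toy sketch, not the kernel
code): with max_data == 0 the chunking loop below never advances, so
"chunks" are produced until memory runs out.

#include <stdio.h>

static size_t split_into_chunks(size_t msg_len, size_t max_data)
{
	size_t offset = 0, nchunks = 0;

	while (offset < msg_len) {
		size_t len = msg_len - offset;

		if (len > max_data)
			len = max_data;
		offset += len;	/* with max_data == 0, offset never advances */
		nchunks++;
	}
	return nchunks;
}

int main(void)
{
	printf("%zu chunks\n", split_into_chunks(4000, 1452));
	/* split_into_chunks(4000, 0) would loop forever */
	return 0;
}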



[PATCH v2 net-next 0/1] net: netem: use a list _and_ rbtree

2018-12-04 Thread Peter Oskolkov
v2: address style suggestions by Stephen Hemminger.

All changes are a no-op vs v1.

Peter Oskolkov (1):
  net: netem: use a list in addition to rbtree

 net/sched/sch_netem.c | 89 +--
 1 file changed, 69 insertions(+), 20 deletions(-)



[PATCH v2 net-next 1/1] net: netem: use a list in addition to rbtree

2018-12-04 Thread Peter Oskolkov
When testing high-bandwidth TCP streams with large windows,
high latency, and low jitter, netem consumes a lot of CPU cycles
doing rbtree rebalancing.

This patch uses a linear list/queue in addition to the rbtree:
if an incoming packet is past the tail of the linear queue, it is
added there, otherwise it is inserted into the rbtree.

Without this patch, perf shows netem_enqueue, netem_dequeue,
and rb_* functions among the top offenders. With this patch,
only netem_enqueue is noticeable if jitter is low/absent.

Suggested-by: Eric Dumazet 
Signed-off-by: Peter Oskolkov 
---
 net/sched/sch_netem.c | 89 +--
 1 file changed, 69 insertions(+), 20 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2c38e3d07924..84658f60a872 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,6 +77,10 @@ struct netem_sched_data {
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
 
+   /* a linear queue; reduces rbtree rebalancing when jitter is low */
+   struct sk_buff  *t_head;
+   struct sk_buff  *t_tail;
+
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc*qdisc;
 
@@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch)
rb_erase(&skb->rbnode, &q->t_root);
rtnl_kfree_skbs(skb, skb);
}
+
+   rtnl_kfree_skbs(q->t_head, q->t_tail);
+   q->t_head = NULL;
+   q->t_tail = NULL;
 }
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
struct netem_sched_data *q = qdisc_priv(sch);
u64 tnext = netem_skb_cb(nskb)->time_to_send;
-   struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-   while (*p) {
-   struct sk_buff *skb;
-
-   parent = *p;
-   skb = rb_to_skb(parent);
-   if (tnext >= netem_skb_cb(skb)->time_to_send)
-   p = &parent->rb_right;
+   if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+   if (q->t_tail)
+   q->t_tail->next = nskb;
else
-   p = &parent->rb_left;
+   q->t_head = nskb;
+   q->t_tail = nskb;
+   } else {
+   struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+   while (*p) {
+   struct sk_buff *skb;
+
+   parent = *p;
+   skb = rb_to_skb(parent);
+   if (tnext >= netem_skb_cb(skb)->time_to_send)
+   p = &parent->rb_right;
+   else
+   p = &parent->rb_left;
+   }
+   rb_link_node(&nskb->rbnode, parent, p);
+   rb_insert_color(&nskb->rbnode, &q->t_root);
}
-   rb_link_node(&nskb->rbnode, parent, p);
-   rb_insert_color(&nskb->rbnode, &q->t_root);
sch->q.qlen++;
 }
 
@@ -530,9 +547,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
t_skb = skb_rb_last(&q->t_root);
t_last = netem_skb_cb(t_skb);
if (!last ||
-   t_last->time_to_send > last->time_to_send) {
+   t_last->time_to_send > last->time_to_send)
+   last = t_last;
+   }
+   if (q->t_tail) {
+   struct netem_skb_cb *t_last =
+   netem_skb_cb(q->t_tail);
+
+   if (!last ||
+   t_last->time_to_send > last->time_to_send)
last = t_last;
-   }
}
 
if (last) {
@@ -611,11 +635,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
q->slot.bytes_left = q->slot_config.max_bytes;
 }
 
+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+   struct sk_buff *skb = skb_rb_first(&q->t_root);
+   u64 t1, t2;
+
+   if (!skb)
+   return q->t_head;
+   if (!q->t_head)
+   return skb;
+
+   t1 = netem_skb_cb(skb)->time_to_send;
+   t2 = netem_skb_cb(q->t_head)->time_to_send;
+   if (t1 < t2)
+   return skb;
+   return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+   if (skb == q->t_head) {
+   q->t_head = skb->next;
+   if (!q->t_head)
+   q->t_tail = NULL;
+   } else {
+   rb_erase(&skb->rbnode, &q->t_root);
+   }
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
-   struct rb_node *p;
 
 tfifo_dequeue:
skb = 
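
As a standalone illustration of the enqueue invariant above (a simplified
sketch, not the kernel code): in-order packets are appended to the linear
list in O(1); only out-of-order packets pay for an rbtree insertion, and
dequeue peeks the earlier of the two heads.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct pkt {
	uint64_t time_to_send;
	struct pkt *next;
};

struct tfifo {
	struct pkt *t_head, *t_tail;	/* linear queue, sorted by construction */
	/* rbtree for out-of-order packets omitted in this sketch */
};

/* Fast path: returns true if the packet was appended to the sorted list;
 * false means the caller must insert it into the rbtree instead.
 */
static bool tfifo_try_append(struct tfifo *q, struct pkt *p)
{
	if (q->t_tail && p->time_to_send < q->t_tail->time_to_send)
		return false;
	if (q->t_tail)
		q->t_tail->next = p;
	else
		q->t_head = p;
	q->t_tail = p;
	p->next = NULL;
	return true;
}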

Re: [oss-drivers] Re: [RFC bpf-next 1/7] bpf: interpreter support BPF_ALU | BPF_ARSH

2018-12-04 Thread Alexei Starovoitov
On Tue, Dec 04, 2018 at 11:42:54AM -0800, Jakub Kicinski wrote:
> On Tue, 4 Dec 2018 11:29:55 -0800, Alexei Starovoitov wrote:
> > Regarding the set... It looks good to me. Pls resubmit without RFC tag
> > and probably include mips patch as patch 1, so we can apply the whole
> > thing to bpf-next. git will take care of the same patch being in two trees.
> 
> Doesn't it only take care if the commit id matches?  I.e. if you were
> to pull the MIPS tree?

I think we've tried that before. As long as all text of the patch
is the same (including commit log), git will not complain.
fingers crossed :)



Re: [RFC bpf-next 1/7] bpf: interpreter support BPF_ALU | BPF_ARSH

2018-12-04 Thread David Miller
From: Alexei Starovoitov 
Date: Tue, 4 Dec 2018 11:29:55 -0800

> I guess sparc doesn't really have 32-bit subregisters.  All registers
> are considered 64-bit. It has 32-bit alu ops on 64-bit registers
> instead.

Right.

Anyways, sparc will require two instructions because of this, the
'sra' then a 'srl' by zero bits to clear the top 32-bits.

I'll code up the sparc JIT part when this goes in.


Re: [PATCH 1/2] vmalloc: New flag for flush before releasing pages

2018-12-04 Thread Andy Lutomirski
On Tue, Dec 4, 2018 at 12:02 PM Edgecombe, Rick P
 wrote:
>
> On Tue, 2018-12-04 at 16:03 +, Will Deacon wrote:
> > On Mon, Dec 03, 2018 at 05:43:11PM -0800, Nadav Amit wrote:
> > > > On Nov 27, 2018, at 4:07 PM, Rick Edgecombe 
> > > > wrote:
> > > >
> > > > Since vfree will lazily flush the TLB, but not lazily free the 
> > > > underlying
> > > > pages,
> > > > it often leaves stale TLB entries to freed pages that could get re-used.
> > > > This is
> > > > undesirable for cases where the memory being freed has special 
> > > > permissions
> > > > such
> > > > as executable.
> > >
> > > So I am trying to finish my patch-set for preventing transient W+X
> > > mappings from taking space, by handling kprobes & ftrace that I missed
> > > (thanks again for pointing it out).
> > >
> > > But all of a sudden, I don't understand why we have the problem that
> > > this (your) patch-set deals with at all. We already change the mappings
> > > to make the memory writable before freeing the memory, so why can't we
> > > make it non-executable at the same time? Actually, why do we make the
> > > module memory, including its data, executable before freeing it???
> >
> > Yeah, this is really confusing, but I have a suspicion it's a combination
> > of the various different configurations and hysterical raisins. We can't
> > rely on module_alloc() allocating from the vmalloc area (see nios2) nor
> > can we rely on disable_ro_nx() being available at build time.
> >
> > If we *could* rely on module allocations always using vmalloc(), then
> > we could pass in Rick's new flag and drop disable_ro_nx() altogether
> > afaict -- who cares about the memory attributes of a mapping that's about
> > to disappear anyway?
> >
> > Is it just nios2 that does something different?
> >
> > Will
>
> Yea, it is really intertwined. I think for x86, set_memory_nx everywhere would
> solve it as well; in fact that was what I first thought the solution should be
> until this was suggested. It's interesting that in the other thread Masami
> Hiramatsu referenced, set_memory_nx was suggested last year and would have
> inadvertently blocked this on x86. But, as I have since learned, it is a bit
> different on the other architectures.
>
> It looks like most arches actually don't re-define set_memory_*, and so all of
> the frob_* functions are just no-ops. In that case allocating RWX is
> needed to make it work at all, because that is what the allocation is going to
> stay at. So on these arches, set_memory_nx won't solve it because it will do
> nothing.
>
> On x86 I think you cannot get rid of disable_ro_nx fully, because the
> permissions on the direct map are changed as well. You don't want some other
> caller getting a page that was left RO when freed and then trying to write to
> it, if I understand this.
>

Exactly.

After slightly more thought, I suggest renaming VM_IMMEDIATE_UNMAP to
VM_MAY_ADJUST_PERMS or similar.  It would have the semantics you want,
but it would also call some arch hooks to put back the direct map
permissions before the flush.  Does that seem reasonable?  It would
need to be hooked up on architectures that implement set_memory_ro(),
but that should be quite easy.  If nothing else, it could fall back to
set_memory_ro()
in the absence of a better implementation.
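
To make the proposal concrete, here is a rough sketch of the flow being
described, with invented names (VM_MAY_ADJUST_PERMS as a vm_struct flag
and arch_restore_direct_map() are hypothetical, not existing kernel API;
find_vm_area() and vfree() are real):

/* Hypothetical teardown for a special-permission vmalloc area: restore
 * direct-map permissions for every backing page, then unmap and flush
 * before the pages can be reused by another allocator.
 */
static void free_special_perms_area(void *addr)
{
	struct vm_struct *area = find_vm_area(addr);
	unsigned int i;

	if (area && (area->flags & VM_MAY_ADJUST_PERMS))	/* hypothetical flag */
		for (i = 0; i < area->nr_pages; i++)
			arch_restore_direct_map(area->pages[i]);	/* hypothetical hook */

	vfree(addr);	/* the TLB flush must happen before page reuse */
}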


Re: [Patch net] mlx5: check for malformed packets

2018-12-04 Thread Cong Wang
On Tue, Dec 4, 2018 at 11:33 AM Saeed Mahameed
 wrote:
>
> On Sat, Dec 1, 2018 at 12:38 PM Cong Wang  wrote:
> >
> > is_last_ethertype_ip() is used to check IP/IPv6 protocol before
> > parsing IP/IPv6 headers.
> >
> > But __vlan_get_protocol() is only bound to skb->len, a malicious
> > packet could exhaust all skb->len by inserting sufficient ETH_P_8021AD
> > headers, and it may not even contain an IP/IPv6 header at all, so we
> > have to check if we are still safe to continue to parse IP/IPv6 header.
> > If not, treat it as non-IP packet.
> >
> > This should not cause any crash as we still have tail room in skb,
> > but we can't just rely on it either.
>
> Hi Cong, is this reproducible or just a theory? Which part of the
> code do you think will cause the invalid access or crash?

Since you don't even read into my changelog, here it is:

"This should not cause any crash as we stil have tail room in skb,
but we can't just rely on it either."

As I already explained to you in a private email, when we
reference whatever field in struct iphdr, we have to make sure
the offset of that field is within skb->len.


> do you have steps to reproduce this?
>

Again, you really have to read the changelog I wrote:


"a malicious
packet could exhaust all skb->len by inserting sufficient ETH_P_8021AD
headers, and it may not even contain an IP/IPv6 header at all, "


> I would like to investigate this myself, it will take a couple of days
> if that's ok with you ..

Sure, take your time. I am sending the patch only for showing
the problem, NOT to merge.


Let's discard it anyway. I am wasting my time.

Thanks.
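
A minimal sketch of the bounds check being described (an illustration of
the idea only; the actual patch is not shown in full in this thread):

#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>

/* After the VLAN headers have been walked, make sure a full IP/IPv6
 * header still fits within skb->len before dereferencing its fields;
 * otherwise treat the frame as non-IP.
 */
static bool ip_hdr_in_bounds(const struct sk_buff *skb, int network_depth,
			     __be16 proto)
{
	unsigned int hlen = proto == htons(ETH_P_IP) ? sizeof(struct iphdr)
						     : sizeof(struct ipv6hdr);

	return network_depth + hlen <= skb->len;
}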


Re: [Patch net v3] mlx5: force CHECKSUM_NONE for short ethernet frames

2018-12-04 Thread Cong Wang
On Mon, Dec 3, 2018 at 11:51 PM Eric Dumazet  wrote:
> > If it really validates L3/L4 checksum, then a full-packet checksum
> > is not needed.
>
> Yes, this is exactly what CHECKSUM_UNNECESSARY means.
> linux stack does not have to perform the check another time.

So you are suggesting to get rid of CHECKSUM_COMPLETE, right?
Like below:

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 7cb988640c9c..b9626c8b6b91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -764,32 +764,6 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
return;
}

-   if (unlikely(test_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state)))
-   goto csum_unnecessary;
-
-   if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) {
-   if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP))
-   goto csum_unnecessary;
-
-   skb->ip_summed = CHECKSUM_COMPLETE;
-   skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
-   if (network_depth > ETH_HLEN)
-   /* CQE csum is calculated from the IP header and does
-* not cover VLAN headers (if present). This will add
-* the checksum manually.
-*/
-   skb->csum = csum_partial(skb->data + ETH_HLEN,
-network_depth - ETH_HLEN,
-skb->csum);
-   if (unlikely(netdev->features & NETIF_F_RXFCS))
-   skb->csum = csum_block_add(skb->csum,
-  (__force __wsum)mlx5e_get_fcs(skb),
-  skb->len - ETH_FCS_LEN);
-   stats->csum_complete++;
-   return;
-   }
-
-csum_unnecessary:


>
> For example, no call to csum_partial() is needed, even for IPv6+TCP or 
> IPv6+UDP

Sure, if you mean to get rid of CHECKSUM_COMPLETE.

Remember you fixed CHECKSUM_COMPLETE too when you touched it,
see d48051c5b837 ("net/mlx5e: fix csum adjustments caused by RXFCS").
So why didn't you remove CHECKSUM_COMPLETE but fixed it instead?

Thanks.


Re: [Patch net v2] mlx5: fixup checksum for short ethernet frame padding

2018-12-04 Thread Cong Wang
On Tue, Dec 4, 2018 at 11:21 AM Saeed Mahameed
 wrote:
> we are still working on it.

No, I give up.

Sorry for wasting time. Let's save everyone's time by discarding it!! :)


Re: [Patch net v3] mlx5: force CHECKSUM_NONE for short ethernet frames

2018-12-04 Thread Saeed Mahameed
On Mon, Dec 3, 2018 at 11:52 PM Eric Dumazet  wrote:
>
> On Mon, Dec 3, 2018 at 11:30 PM Cong Wang  wrote:
> >
> > On Mon, Dec 3, 2018 at 11:08 PM Eric Dumazet  wrote:
> > >
> > > The hardware has probably validated the L3 & L4 checksum just fine.
> > >
> > > Note that if ip_summed is CHECKSUM_UNNECESSARY, the padding bytes (if any)
> > > have no impact on the csum that has been verified by the NIC.
> >
> >
> > Why? Why does the hardware validates L3/L4 checksum when it
> > supplies a full-packet checksum? What's its point here?
>
> The point is that the driver author can decide what is best.
>
> For native IP+TCP or IP+UDP, the NIC has the ability to fully
> understand the packet and fully validate the checksum.

Also for native IPv4 and IPv6 plain L3 packets.
The hardware validates the csum when it can, and always provides
checksum complete for all packets.
One of the reasons to validate is that sometimes we want to skip
checksum complete, but still leverage the hw validation,
like in your patch :), or the LRO case, or many other cases in other
kernels/OSes/drivers.

So I agree with Eric, let's jump to checksum_unnecessary for short packets.

>
> >
> > If it really validates L3/L4 checksum, then a full-packet checksum
> > is not needed.
>
> Yes, this is exactly what CHECKSUM_UNNECESSARY means.
> linux stack does not have to perform the check another time.
>
> For example, no call to csum_partial() is needed, even for IPv6+TCP or 
> IPv6+UDP
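
One property underlying the padding discussion above, shown as a
standalone sketch (not driver code): the Internet checksum is a 16-bit
one's-complement sum, so all-zero padding bytes cannot change it.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t csum16(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)	/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t pkt[64] = { 0x45, 0x00, 0x00, 0x2e, 0xde, 0xad };

	/* bytes 6..63 are zero padding: including them changes nothing */
	assert(csum16(pkt, 6) == csum16(pkt, sizeof(pkt)));
	return 0;
}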


Re: [Patch net] mlx5: check for malformed packets

2018-12-04 Thread Saeed Mahameed
On Sat, Dec 1, 2018 at 12:38 PM Cong Wang  wrote:
>
> is_last_ethertype_ip() is used to check IP/IPv6 protocol before
> parsing IP/IPv6 headers.
>
> But __vlan_get_protocol() is only bound to skb->len, a malicious
> packet could exhaust all skb->len by inserting sufficient ETH_P_8021AD
> headers, and it may not even contain an IP/IPv6 header at all, so we
> have to check if we are still safe to continue to parse IP/IPv6 header.
> If not, treat it as non-IP packet.
>
> This should not cause any crash as we still have tail room in skb,
> but we can't just rely on it either.

Hi Cong, is this reproducible or just a theory? Which part of the
code do you think will cause the invalid access or crash?
do you have steps to reproduce this?

I would like to investigate this myself, it will take a couple of days
if that's ok with you ..

Thanks,
Saeed.


Re: [PATCH net-next] net: netem: use a list in addition to rbtree

2018-12-04 Thread David Miller
From: Peter Oskolkov 
Date: Tue, 4 Dec 2018 11:10:55 -0800

> Thanks, Stephen!
> 
> I don't care much about braces either. David, do you want me to send a
> new patch with braces moved around?

Single statement basic blocks definitely must not have curly braces,
please remove them and repost.

Thank you.


Re: [PATCH v2 net-next 1/1] net: netem: use a list in addition to rbtree

2018-12-04 Thread Eric Dumazet



On 12/04/2018 11:55 AM, Peter Oskolkov wrote:
> When testing high-bandwidth TCP streams with large windows,
> high latency, and low jitter, netem consumes a lot of CPU cycles
> doing rbtree rebalancing.
> 
> This patch uses a linear list/queue in addition to the rbtree:
> if an incoming packet is past the tail of the linear queue, it is
> added there, otherwise it is inserted into the rbtree.
> 
> Without this patch, perf shows netem_enqueue, netem_dequeue,
> and rb_* functions among the top offenders. With this patch,
> only netem_enqueue is noticeable if jitter is low/absent.
> 
> Suggested-by: Eric Dumazet 
> Signed-off-by: Peter Oskolkov 
> ---

Reviewed-by: Eric Dumazet 



[PATCH bpf-next 0/7] bpf: support BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
BPF_ALU | BPF_ARSH | BPF_* were rejected by commit: 7891a87efc71
("bpf: arsh is not supported in 32 bit alu thus reject it"). As explained
in the commit message, this was because there was no complete support for them
in the interpreter and various JIT compilation back-ends.

This patch set is a follow-up which completes the missing bits. It also
paves the way for running bpf programs compiled with ALU32 instructions
enabled by specifying -mattr=+alu32 to LLVM, in which case there are likely
to be more BPF_ALU | BPF_ARSH insns that would trigger the rejection code.

test_verifier.c is updated accordingly.

I have tested this patch set on x86-64 and NFP; I need help with review and
testing of the arch changes (mips/ppc/s390).

Note, there might be a merge conflict on the mips change, which is best
applied on top of:

  commit: 20b880a05f06 ("mips: bpf: fix encoding bug for mm_srlv32_op"),

which is on mips-fixes branch at the moment.

Thanks.

Jiong Wang (7):
  mips: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_X
  ppc: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  s390: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  nfp: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*
  bpf: interpreter support BPF_ALU | BPF_ARSH
  bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH
  selftests: bpf: update testcases for BPF_ALU | BPF_ARSH

 arch/mips/include/asm/uasm.h |  1 +
 arch/mips/include/uapi/asm/inst.h|  1 +
 arch/mips/mm/uasm-micromips.c|  1 +
 arch/mips/mm/uasm-mips.c |  1 +
 arch/mips/mm/uasm.c  |  9 ++---
 arch/mips/net/ebpf_jit.c |  4 +++
 arch/powerpc/include/asm/ppc-opcode.h|  2 ++
 arch/powerpc/net/bpf_jit.h   |  4 +++
 arch/powerpc/net/bpf_jit_comp64.c|  6 
 arch/s390/net/bpf_jit_comp.c | 12 +++
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 45 
 kernel/bpf/core.c| 52 
 kernel/bpf/verifier.c|  5 ---
 tools/testing/selftests/bpf/test_verifier.c  | 29 +---
 14 files changed, 137 insertions(+), 35 deletions(-)

-- 
2.7.4



[PATCH bpf-next 2/7] ppc: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
This patch implements code-gen for BPF_ALU | BPF_ARSH | BPF_*.

Cc: Naveen N. Rao 
Cc: Sandipan Das 
Signed-off-by: Jiong Wang 
---
 arch/powerpc/include/asm/ppc-opcode.h | 2 ++
 arch/powerpc/net/bpf_jit.h| 4 
 arch/powerpc/net/bpf_jit_comp64.c | 6 ++
 3 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index a6e9e31..9014592 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -342,6 +342,8 @@
 #define PPC_INST_SLW   0x7c30
 #define PPC_INST_SLD   0x7c36
 #define PPC_INST_SRW   0x7c000430
+#define PPC_INST_SRAW  0x7c000630
+#define PPC_INST_SRAWI 0x7c000670
 #define PPC_INST_SRD   0x7c000436
 #define PPC_INST_SRAD  0x7c000634
 #define PPC_INST_SRADI 0x7c000674
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 47fc666..c2d5192 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -152,6 +152,10 @@
 ___PPC_RS(a) | ___PPC_RB(s))
 #define PPC_SRW(d, a, s)   EMIT(PPC_INST_SRW | ___PPC_RA(d) |\
 ___PPC_RS(a) | ___PPC_RB(s))
+#define PPC_SRAW(d, a, s)  EMIT(PPC_INST_SRAW | ___PPC_RA(d) |   \
+___PPC_RS(a) | ___PPC_RB(s))
+#define PPC_SRAWI(d, a, i) EMIT(PPC_INST_SRAWI | ___PPC_RA(d) |  \
+___PPC_RS(a) | __PPC_SH(i))
 #define PPC_SRD(d, a, s)   EMIT(PPC_INST_SRD | ___PPC_RA(d) |\
 ___PPC_RS(a) | ___PPC_RB(s))
 #define PPC_SRAD(d, a, s)  EMIT(PPC_INST_SRAD | ___PPC_RA(d) |   \
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 17482f5..c685b4f 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -529,9 +529,15 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
if (imm != 0)
PPC_SRDI(dst_reg, dst_reg, imm);
break;
+   case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */
+   PPC_SRAW(dst_reg, dst_reg, src_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */
PPC_SRAD(dst_reg, dst_reg, src_reg);
break;
+   case BPF_ALU | BPF_ARSH | BPF_K: /* (s32) dst >>= imm */
+   PPC_SRAWI(dst_reg, dst_reg, imm);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* (s64) dst >>= imm */
if (imm != 0)
PPC_SRADI(dst_reg, dst_reg, imm);
-- 
2.7.4



[PATCH bpf-next 5/7] bpf: interpreter support BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
This patch implements interpreting BPF_ALU | BPF_ARSH: do an arithmetic right
shift on the low 32-bit sub-register, and zero the high 32 bits.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 kernel/bpf/core.c | 52 ++--
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f93ed66..36e31d8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -923,32 +923,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 #define BPF_INSN_MAP(INSN_2, INSN_3)   \
/* 32 bit ALU operations. */\
/*   Register based. */ \
-   INSN_3(ALU, ADD, X),\
-   INSN_3(ALU, SUB, X),\
-   INSN_3(ALU, AND, X),\
-   INSN_3(ALU, OR,  X),\
-   INSN_3(ALU, LSH, X),\
-   INSN_3(ALU, RSH, X),\
-   INSN_3(ALU, XOR, X),\
-   INSN_3(ALU, MUL, X),\
-   INSN_3(ALU, MOV, X),\
-   INSN_3(ALU, DIV, X),\
-   INSN_3(ALU, MOD, X),\
+   INSN_3(ALU, ADD,  X),   \
+   INSN_3(ALU, SUB,  X),   \
+   INSN_3(ALU, AND,  X),   \
+   INSN_3(ALU, OR,   X),   \
+   INSN_3(ALU, LSH,  X),   \
+   INSN_3(ALU, RSH,  X),   \
+   INSN_3(ALU, XOR,  X),   \
+   INSN_3(ALU, MUL,  X),   \
+   INSN_3(ALU, MOV,  X),   \
+   INSN_3(ALU, ARSH, X),   \
+   INSN_3(ALU, DIV,  X),   \
+   INSN_3(ALU, MOD,  X),   \
INSN_2(ALU, NEG),   \
INSN_3(ALU, END, TO_BE),\
INSN_3(ALU, END, TO_LE),\
/*   Immediate based. */\
-   INSN_3(ALU, ADD, K),\
-   INSN_3(ALU, SUB, K),\
-   INSN_3(ALU, AND, K),\
-   INSN_3(ALU, OR,  K),\
-   INSN_3(ALU, LSH, K),\
-   INSN_3(ALU, RSH, K),\
-   INSN_3(ALU, XOR, K),\
-   INSN_3(ALU, MUL, K),\
-   INSN_3(ALU, MOV, K),\
-   INSN_3(ALU, DIV, K),\
-   INSN_3(ALU, MOD, K),\
+   INSN_3(ALU, ADD,  K),   \
+   INSN_3(ALU, SUB,  K),   \
+   INSN_3(ALU, AND,  K),   \
+   INSN_3(ALU, OR,   K),   \
+   INSN_3(ALU, LSH,  K),   \
+   INSN_3(ALU, RSH,  K),   \
+   INSN_3(ALU, XOR,  K),   \
+   INSN_3(ALU, MUL,  K),   \
+   INSN_3(ALU, MOV,  K),   \
+   INSN_3(ALU, ARSH, K),   \
+   INSN_3(ALU, DIV,  K),   \
+   INSN_3(ALU, MOD,  K),   \
/* 64 bit ALU operations. */\
/*   Register based. */ \
INSN_3(ALU64, ADD,  X), \
@@ -1127,6 +1129,12 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+   ALU_ARSH_X:
+   DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+   CONT;
+   ALU_ARSH_K:
+   DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+   CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
-- 
2.7.4
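
A quick userspace check of the ALU32 ARSH semantics implemented above
(a sketch assuming arithmetic right shift of signed integers, which gcc
and clang provide; the expected values are taken from the selftest
updates in patch 7):

#include <assert.h>
#include <stdint.h>

/* Operate on the low 32 bits as signed, then zero-extend to 64 bits. */
static uint64_t alu32_arsh(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)((int32_t)dst >> shift);
}

int main(void)
{
	/* "arsh32 on imm 2": the low half 0x85667788 is negative as s32 */
	assert((int32_t)alu32_arsh(0x1122334485667788ULL, 7) == -16069393);
	/* "arsh32 on reg 2" */
	assert(alu32_arsh(0x55667788ULL, 15) == 43724);
	return 0;
}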



[PATCH bpf-next 1/7] mips: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_X

2018-12-04 Thread Jiong Wang
Jitting of BPF_K is supported already, but not BPF_X. This patch completes
the support for the latter on both MIPS and microMIPS.

Cc: Paul Burton 
Cc: linux-m...@vger.kernel.org
Signed-off-by: Jiong Wang 
---
 arch/mips/include/asm/uasm.h  | 1 +
 arch/mips/include/uapi/asm/inst.h | 1 +
 arch/mips/mm/uasm-micromips.c | 1 +
 arch/mips/mm/uasm-mips.c  | 1 +
 arch/mips/mm/uasm.c   | 9 +
 arch/mips/net/ebpf_jit.c  | 4 
 6 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index 59dae37..b1990dd 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -157,6 +157,7 @@ Ip_u2u1s3(_slti);
 Ip_u2u1s3(_sltiu);
 Ip_u3u1u2(_sltu);
 Ip_u2u1u3(_sra);
+Ip_u3u2u1(_srav);
 Ip_u2u1u3(_srl);
 Ip_u3u2u1(_srlv);
 Ip_u3u1u2(_subu);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 273ef58..40fbb5d 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -371,6 +371,7 @@ enum mm_32a_minor_op {
mm_srl32_op = 0x040,
mm_srlv32_op = 0x050,
mm_sra_op = 0x080,
+   mm_srav_op = 0x090,
mm_rotr_op = 0x0c0,
mm_lwxs_op = 0x118,
mm_addu32_op = 0x150,
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index 24e5b0d..75ef904 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -104,6 +104,7 @@ static const struct insn insn_table_MM[insn_invalid] = {
[insn_sltiu]= {M(mm_sltiu32_op, 0, 0, 0, 0, 0), RT | RS | SIMM},
[insn_sltu] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_sltu_op), RT | RS | RD},
[insn_sra]  = {M(mm_pool32a_op, 0, 0, 0, 0, mm_sra_op), RT | RS | RD},
+   [insn_srav] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srav_op), RT | RS | RD},
[insn_srl]  = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srl32_op), RT | RS | RD},
[insn_srlv] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_srlv32_op), RT | RS | RD},
[insn_rotr] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_rotr_op), RT | RS | RD},
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 60ceb93..6abe40f 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -171,6 +171,7 @@ static const struct insn insn_table[insn_invalid] = {
[insn_sltiu]= {M(sltiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM},
[insn_sltu] = {M(spec_op, 0, 0, 0, 0, sltu_op), RS | RT | RD},
[insn_sra]  = {M(spec_op, 0, 0, 0, 0, sra_op),  RT | RD | RE},
+   [insn_srav] = {M(spec_op, 0, 0, 0, 0, srav_op), RS | RT | RD},
[insn_srl]  = {M(spec_op, 0, 0, 0, 0, srl_op),  RT | RD | RE},
[insn_srlv] = {M(spec_op, 0, 0, 0, 0, srlv_op),  RS | RT | RD},
[insn_subu] = {M(spec_op, 0, 0, 0, 0, subu_op), RS | RT | RD},
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 57570c0..45b6264 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -61,10 +61,10 @@ enum opcode {
insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_nor,
insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb,
insn_sc, insn_scd, insn_sd, insn_sh, insn_sll, insn_sllv,
-   insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, insn_srl,
-   insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
-   insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
-   insn_xori, insn_yield,
+   insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, insn_srav,
+   insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall,
+   insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh,
+   insn_xor, insn_xori, insn_yield,
insn_invalid /* insn_invalid must be last */
 };
 
@@ -353,6 +353,7 @@ I_u2u1s3(_slti)
 I_u2u1s3(_sltiu)
 I_u3u1u2(_sltu)
 I_u2u1u3(_sra)
+I_u3u2u1(_srav)
 I_u2u1u3(_srl)
 I_u3u2u1(_srlv)
 I_u2u1u3(_rotr)
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index aeb7b1b..b16710a 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -854,6 +854,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_ALU | BPF_MOD | BPF_X: /* ALU_REG */
case BPF_ALU | BPF_LSH | BPF_X: /* ALU_REG */
case BPF_ALU | BPF_RSH | BPF_X: /* ALU_REG */
+   case BPF_ALU | BPF_ARSH | BPF_X: /* ALU_REG */
src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
if (src < 0 || dst < 0)
@@ -913,6 +914,9 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_RSH:
emit_instr(ctx, srlv, dst, dst, src);
break;
+   case BPF_ARSH:
+   emit_instr(ctx, srav, dst, dst, src);
+   break;
  

[PATCH bpf-next 7/7] selftests: bpf: update testcases for BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
"arsh32 on imm" and "arsh32 on reg" now are accepted. Also added two new
testcases to make sure arsh32 won't be treated as arsh64 during
interpretation or JIT code-gen for which case the high bits will be moved
into low halve that the testcases could catch them.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 tools/testing/selftests/bpf/test_verifier.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 18d0b7f..e7f1bc7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -721,8 +721,18 @@ static struct bpf_test tests[] = {
BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 5),
BPF_EXIT_INSN(),
},
-   .result = REJECT,
-   .errstr = "unknown opcode c4",
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "arsh32 on imm 2",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 0x1122334485667788),
+   BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 7),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = -16069393,
},
{
"arsh32 on reg",
@@ -732,8 +742,19 @@ static struct bpf_test tests[] = {
BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
-   .result = REJECT,
-   .errstr = "unknown opcode cc",
+   .result = ACCEPT,
+   .retval = 0,
+   },
+   {
+   "arsh32 on reg 2",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 0x55667788),
+   BPF_MOV64_IMM(BPF_REG_1, 15),
+   BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 43724,
},
{
"arsh64 on imm",
-- 
2.7.4



[PATCH bpf-next 6/7] bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH

2018-12-04 Thread Jiong Wang
This patch removes the rejection on BPF_ALU | BPF_ARSH, as it is now
supported by the interpreter and all JIT back-ends.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 kernel/bpf/verifier.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce8a1c3..8ed868e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3649,11 +3649,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
 
-   if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
-   verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
-   return -EINVAL;
-   }
-
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
-- 
2.7.4



[PATCH bpf-next 3/7] s390: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
This patch implements code-gen for BPF_ALU | BPF_ARSH | BPF_*.

Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Signed-off-by: Jiong Wang 
---
 arch/s390/net/bpf_jit_comp.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d7052cb..3ff758e 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -821,10 +821,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
/*
 * BPF_ARSH
 */
+   case BPF_ALU | BPF_ARSH | BPF_X: /* ((s32) dst) >>= src */
+   /* sra %dst,%dst,0(%src) */
+   EMIT4_DISP(0x8a00, dst_reg, src_reg, 0);
+   EMIT_ZERO(dst_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_X: /* ((s64) dst) >>= src */
/* srag %dst,%dst,0(%src) */
EMIT6_DISP_LH(0xeb00, 0x000a, dst_reg, dst_reg, src_reg, 0);
break;
+   case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst) >>= imm */
+   if (imm == 0)
+   break;
+   /* sra %dst,imm(%r0) */
+   EMIT4_DISP(0x8a00, dst_reg, REG_0, imm);
+   EMIT_ZERO(dst_reg);
+   break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */
if (imm == 0)
break;
-- 
2.7.4



[PATCH bpf-next 4/7] nfp: bpf: implement jitting of BPF_ALU | BPF_ARSH | BPF_*

2018-12-04 Thread Jiong Wang
BPF_X support needs the indirect shift mode; please see the code comments
for details.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Jiong Wang 
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 45 
 1 file changed, 45 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 97d33bb..662cbc2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2382,6 +2382,49 @@ static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
return 0;
 }
 
+static int __ashr_imm(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
+{
+   /* Set signedness bit (MSB of result). */
+   emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR, reg_imm(0));
+   emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, reg_b(dst),
+SHF_SC_R_SHF, shift_amt);
+   wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+
+   return 0;
+}
+
+static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u64 umin, umax;
+   u8 dst, src;
+
+   dst = insn->dst_reg * 2;
+   umin = meta->umin_src;
+   umax = meta->umax_src;
+   if (umin == umax)
+   return __ashr_imm(nfp_prog, dst, umin);
+
+   src = insn->src_reg * 2;
+   /* NOTE: the first insn will set both indirect shift amount (source A)
+* and signedness bit (MSB of result).
+*/
+   emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
+   emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+  reg_b(dst), SHF_SC_R_SHF);
+   wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+
+   return 0;
+}
+
+static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   const struct bpf_insn *insn = &meta->insn;
+   u8 dst = insn->dst_reg * 2;
+
+   return __ashr_imm(nfp_prog, dst, insn->imm);
+}
+
 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
const struct bpf_insn *insn = &meta->insn;
@@ -3286,6 +3329,8 @@ static const instr_cb_t instr_cb[256] = {
[BPF_ALU | BPF_DIV | BPF_K] =   div_imm,
[BPF_ALU | BPF_NEG] =   neg_reg,
[BPF_ALU | BPF_LSH | BPF_K] =   shl_imm,
+   [BPF_ALU | BPF_ARSH | BPF_X] =  ashr_reg,
+   [BPF_ALU | BPF_ARSH | BPF_K] =  ashr_imm,
[BPF_ALU | BPF_END | BPF_X] =   end_reg32,
[BPF_LD | BPF_IMM | BPF_DW] =   imm_ld8,
[BPF_LD | BPF_ABS | BPF_B] =data_ld1,
-- 
2.7.4


