[PATCH v2 net-next 1/5] net/flow_dissector: Save vlan ethertype from headers

2018-07-05 Thread Jianbo Liu
Change vlan dissector key to save vlan tpid to support both 802.1Q
and 802.1AD ethertype.

Signed-off-by: Jianbo Liu 
Acked-by: Jiri Pirko 
---
 include/net/flow_dissector.h | 2 +-
 net/core/flow_dissector.c| 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index adc24df5..8f89968 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -47,7 +47,7 @@ struct flow_dissector_key_tags {
 struct flow_dissector_key_vlan {
u16 vlan_id:12,
vlan_priority:3;
-   u16 padding;
+   __be16  vlan_tpid;
 };
 
 struct flow_dissector_key_mpls {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 53f96e4..18cb99b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -751,6 +751,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
+   __be16 saved_vlan_tpid = proto;
 
if (vlan_tag_present)
proto = skb->protocol;
@@ -789,6 +790,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
(ntohs(vlan->h_vlan_TCI) &
 VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
+   key_vlan->vlan_tpid = saved_vlan_tpid;
}
 
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
-- 
2.9.5



[PATCH v2 net-next 2/5] net/sched: flower: Add support for matching on vlan ethertype

2018-07-05 Thread Jianbo Liu
As the flow dissector now stores the vlan ethertype, tc flower can match on it.
This prepares for QinQ support.

Signed-off-by: Jianbo Liu 
Acked-by: Jiri Pirko 
---
 net/sched/cls_flower.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 352876b..da9ec30 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -500,6 +500,7 @@ static int fl_set_key_mpls(struct nlattr **tb,
 }
 
 static void fl_set_key_vlan(struct nlattr **tb,
+   __be16 ethertype,
struct flow_dissector_key_vlan *key_val,
struct flow_dissector_key_vlan *key_mask)
 {
@@ -516,6 +517,8 @@ static void fl_set_key_vlan(struct nlattr **tb,
VLAN_PRIORITY_MASK;
key_mask->vlan_priority = VLAN_PRIORITY_MASK;
}
+   key_val->vlan_tpid = ethertype;
+   key_mask->vlan_tpid = cpu_to_be16(~0);
 }
 
 static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
@@ -592,8 +595,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
 
-   if (ethertype == htons(ETH_P_8021Q)) {
-   fl_set_key_vlan(tb, >vlan, >vlan);
+   if (eth_type_vlan(ethertype)) {
+   fl_set_key_vlan(tb, ethertype, >vlan, >vlan);
fl_set_key_val(tb, >basic.n_proto,
   TCA_FLOWER_KEY_VLAN_ETH_TYPE,
   >basic.n_proto, TCA_FLOWER_UNSPEC,
-- 
2.9.5



[PATCH v2 net-next 5/5] net/sched: flower: Add support for matching on QinQ vlan headers

2018-07-05 Thread Jianbo Liu
Now that the flow dissector supports dissecting both the inner and outer vlan
headers of QinQ packets, users can add rules to match on QinQ vlan headers.

Signed-off-by: Jianbo Liu 
Acked-by: Jiri Pirko 
---
 include/uapi/linux/pkt_cls.h |  4 +++
 net/sched/cls_flower.c   | 65 ++--
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 84e4c1d..c4262d9 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -469,6 +469,10 @@ enum {
TCA_FLOWER_KEY_IP_TTL,  /* u8 */
TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */
 
+   TCA_FLOWER_KEY_CVLAN_ID,/* be16 */
+   TCA_FLOWER_KEY_CVLAN_PRIO,  /* u8   */
+   TCA_FLOWER_KEY_CVLAN_ETH_TYPE,  /* be16 */
+
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e93b13d..487a152 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -35,6 +35,7 @@ struct fl_flow_key {
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_vlan vlan;
+   struct flow_dissector_key_vlan cvlan;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
@@ -449,6 +450,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_IP_TOS_MASK]= { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL_MASK]= { .type = NLA_U8 },
+   [TCA_FLOWER_KEY_CVLAN_ID]   = { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
+   [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -501,19 +505,20 @@ static int fl_set_key_mpls(struct nlattr **tb,
 
 static void fl_set_key_vlan(struct nlattr **tb,
__be16 ethertype,
+   int vlan_id_key, int vlan_prio_key,
struct flow_dissector_key_vlan *key_val,
struct flow_dissector_key_vlan *key_mask)
 {
 #define VLAN_PRIORITY_MASK 0x7
 
-   if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+   if (tb[vlan_id_key]) {
key_val->vlan_id =
-   nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+   nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
key_mask->vlan_id = VLAN_VID_MASK;
}
-   if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+   if (tb[vlan_prio_key]) {
key_val->vlan_priority =
-   nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+   nla_get_u8(tb[vlan_prio_key]) &
VLAN_PRIORITY_MASK;
key_mask->vlan_priority = VLAN_PRIORITY_MASK;
}
@@ -596,11 +601,25 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
 
if (eth_type_vlan(ethertype)) {
-   fl_set_key_vlan(tb, ethertype, >vlan, >vlan);
-   fl_set_key_val(tb, >basic.n_proto,
-  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
-  >basic.n_proto, TCA_FLOWER_UNSPEC,
-  sizeof(key->basic.n_proto));
+   fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+   TCA_FLOWER_KEY_VLAN_PRIO, >vlan,
+   >vlan);
+
+   ethertype = 
nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
+   if (eth_type_vlan(ethertype)) {
+   fl_set_key_vlan(tb, ethertype,
+   TCA_FLOWER_KEY_CVLAN_ID,
+   TCA_FLOWER_KEY_CVLAN_PRIO,
+   >cvlan, >cvlan);
+   fl_set_key_val(tb, >basic.n_proto,
+  TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+  >basic.n_proto,
+  TCA_FLOWER_UNSPEC,
+  sizeof(key->basic.n_proto));
+   } else {
+   key->basic.n_proto = ethertype;
+   mask->basic.n_proto = cpu_to_be16(~0);
+   }
} else {
key->basic.n_proto = ethertype;
mask->basic.n_proto = cpu_to_be16(~0);
@@ -826,6 +845,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask)
FL_KEY_SET_IF_MASKED(>key, keys, cnt,
 FLOW_DISSECTOR_KEY_VLAN, vlan);

[PATCH v2 net-next 4/5] net/sched: flower: Dump the ethertype encapsulated in vlan

2018-07-05 Thread Jianbo Liu
Currently the encapsulated ethertype is not dumped, because it is the same as
the TCA_FLOWER_KEY_ETH_TYPE key value. That makes the dump inconsistent with
the input, so dump it as TCA_FLOWER_KEY_VLAN_ETH_TYPE.

Signed-off-by: Jianbo Liu 
Acked-by: Jiri Pirko 
---
 net/sched/cls_flower.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index da9ec30..e93b13d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1313,6 +1313,10 @@ static int fl_dump(struct net *net, struct tcf_proto 
*tp, void *fh,
if (fl_dump_key_vlan(skb, >vlan, >vlan))
goto nla_put_failure;
 
+   if (mask->vlan.vlan_tpid &&
+   nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto))
+   goto nla_put_failure;
+
if ((key->basic.n_proto == htons(ETH_P_IP) ||
 key->basic.n_proto == htons(ETH_P_IPV6)) &&
(fl_dump_key_val(skb, >basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
-- 
2.9.5



[PATCH v2 net-next 3/5] net/flow_dissector: Add support for QinQ dissection

2018-07-05 Thread Jianbo Liu
Dissect the QinQ packets to get both outer and inner vlan information,
then store to the extended flow keys.

Signed-off-by: Jianbo Liu 
Acked-by: Jiri Pirko 
---
 include/net/flow_dissector.h |  2 ++
 net/core/flow_dissector.c| 32 +---
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 8f89968..c644067 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -206,6 +206,7 @@ enum flow_dissector_key_id {
FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
+   FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
 
FLOW_DISSECTOR_KEY_MAX,
 };
@@ -237,6 +238,7 @@ struct flow_keys {
struct flow_dissector_key_basic basic;
struct flow_dissector_key_tags tags;
struct flow_dissector_key_vlan vlan;
+   struct flow_dissector_key_vlan cvlan;
struct flow_dissector_key_keyid keyid;
struct flow_dissector_key_ports ports;
struct flow_dissector_key_addrs addrs;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 18cb99b..b555fc2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -589,7 +589,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
-   bool skip_vlan = false;
+   enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -748,15 +748,14 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
}
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
-   const struct vlan_hdr *vlan;
+   const struct vlan_hdr *vlan = NULL;
struct vlan_hdr _vlan;
-   bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
__be16 saved_vlan_tpid = proto;
 
-   if (vlan_tag_present)
+   if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
+   skb && skb_vlan_tag_present(skb)) {
proto = skb->protocol;
-
-   if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
+   } else {
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
data, hlen, &_vlan);
if (!vlan) {
@@ -766,20 +765,23 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
-   if (skip_vlan) {
-   fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
-   break;
-   }
}
 
-   skip_vlan = true;
-   if (dissector_uses_key(flow_dissector,
-  FLOW_DISSECTOR_KEY_VLAN)) {
+   if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
+   dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
+   } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
+   dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
+   } else {
+   fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+   break;
+   }
+
+   if (dissector_uses_key(flow_dissector, dissector_vlan)) {
key_vlan = skb_flow_dissector_target(flow_dissector,
-
FLOW_DISSECTOR_KEY_VLAN,
+dissector_vlan,
 target_container);
 
-   if (vlan_tag_present) {
+   if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
key_vlan->vlan_priority =
(skb_vlan_tag_get_prio(skb) >> 
VLAN_PRIO_SHIFT);
-- 
2.9.5



[PATCH v2 net-next 0/5] Introduce matching on double vlan/QinQ headers for TC flower

2018-07-05 Thread Jianbo Liu
Currently TC flower supports only one vlan tag, so it cannot match on both the
outer and inner vlan headers of a QinQ packet. To support this, we make the flow
dissector extract both the outer and inner vlan headers, and then let TC flower
match on that information.

We also plan to extend TC command to support this feature. We add new
cvlan_id/cvlan_prio/cvlan_ethtype keywords for inner vlan header. The existing
vlan_id/vlan_prio/vlan_ethtype are for outer vlan header, and vlan_ethtype must
be 802.1q or 802.1ad.

The examples for command and output are as the following.
# tc filter add dev ens1f1 parent ffff: protocol 802.1ad pref 33 \
flower vlan_id 1000 vlan_ethtype 802.1q \
cvlan_id 100 cvlan_ethtype ipv4 \
action vlan pop \
action vlan pop \
action mirred egress redirect dev ens1f1_0

# tc filter show dev ens1f1 ingress
filter protocol 802.1ad pref 33 flower chain 0
filter protocol 802.1ad pref 33 flower chain 0 handle 0x1
  vlan_id 1000
  vlan_ethtype 802.1Q
  cvlan_id 100
  cvlan_ethtype ip
  eth_type ipv4
  in_hw
...

v2:
  fix sparse warning.

Jianbo Liu (5):
  net/flow_dissector: Save vlan ethertype from headers
  net/sched: flower: Add support for matching on vlan ethertype
  net/flow_dissector: Add support for QinQ dissection
  net/sched: flower: Dump the ethertype encapsulated in vlan
  net/sched: flower: Add support for matching on QinQ vlan headers

 include/net/flow_dissector.h |  4 ++-
 include/uapi/linux/pkt_cls.h |  4 +++
 net/core/flow_dissector.c| 34 +++--
 net/sched/cls_flower.c   | 70 
 4 files changed, 83 insertions(+), 29 deletions(-)

-- 
2.9.5



Re: Crash due to destroying TCP request sockets using SOCK_DESTROY

2018-07-05 Thread Lorenzo Colitti
On Fri, Jul 6, 2018 at 11:37 AM Subash Abhinov Kasiviswanathan
 wrote:
>
>  From the call stack, a TCP socket is being destroyed using netlink_diag.
> The memory dump showed that the socket was an inet request socket (in
> state TCP_NEW_SYN_RECV) with refcount of 0.
> [...]
>   13232.479820:   <2> refcount_t: underflow; use-after-free.
>   13232.479838:   <6> [ cut here ]
>   13232.479843:   <6> kernel BUG at kernel/msm-4.14/lib/refcount.c:204!
>   13232.479849:   <6> Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
> [...]
>   13232.479996:   <6> Process netd (pid: 648, stack limit =
> 0xff801cf98000)
>   13232.479998:   <2> Call trace:
>   13232.48:   <2>  refcount_sub_and_test+0x64/0x78
>   13232.480002:   <2>  refcount_dec_and_test+0x18/0x24
>   13232.480005:   <2>  sock_gen_put+0x1c/0xb0
>   13232.480009:   <2>  tcp_diag_destroy+0x54/0x68
> [...]

Looks like for a TCP_NEW_SYN_RECV socket, sock_diag_destroy
essentially ends up doing:

struct request_sock *req = inet_reqsk(sk);

local_bh_disable();
inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
  req);
local_bh_enable();
...

sock_gen_put(sk);

It looks like inet_csk_reqsk_queue_drop_and_put calls reqsk_put(req),
which frees the socket, and at that point sock_gen_put is a UAF. Do we
just need:

-inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
-   req);
+inet_csk_reqsk_queue_drop(req->rsk_listener, req);

since sock_gen_put will also end up calling reqsk_put() for a
TCP_NEW_SYN_RECV socket?

Alastair - you're able to reproduce this UAF using net_test on qemu,
right? If so, could you try that two-line patch above?


[PATCH bpf] xdp: XDP_REDIRECT should check IFF_UP and MTU

2018-07-05 Thread Toshiaki Makita
Otherwise we end up with attempting to send packets from down devices
or to send oversized packets, which may cause unexpected driver/device
behaviour. Generic XDP has already done this check, so reuse the logic
in native XDP.

Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function")
Signed-off-by: Toshiaki Makita 
---
 include/linux/filter.h | 6 +++---
 kernel/bpf/devmap.c| 7 ++-
 net/core/filter.c  | 9 +++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 300baad..c73dd73 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
   const struct bpf_insn *patch, u32 len);
 
-static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb,
-  struct net_device *fwd)
+static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
+unsigned int pktlen)
 {
unsigned int len;
 
@@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff 
*skb,
return -ENETDOWN;
 
len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-   if (skb->len > len)
+   if (pktlen > len)
return -EMSGSIZE;
 
return 0;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f..d361fc1 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct 
xdp_buff *xdp,
 {
struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
+   int err;
 
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
 
+   err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+   if (unlikely(err))
+   return err;
+
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, 
struct sk_buff *skb,
 {
int err;
 
-   err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+   err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
skb->dev = dst->dev;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0ca6907..2303f73 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3046,12 +3046,16 @@ static int __bpf_tx_xdp(struct net_device *dev,
u32 index)
 {
struct xdp_frame *xdpf;
-   int sent;
+   int err, sent;
 
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
 
+   err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+   if (unlikely(err))
+   return err;
+
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
@@ -3285,7 +3289,8 @@ int xdp_do_generic_redirect(struct net_device *dev, 
struct sk_buff *skb,
goto err;
}
 
-   if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd
+   err = xdp_ok_fwd_dev(fwd, skb->len);
+   if (unlikely(err))
goto err;
 
skb->dev = fwd;
-- 
1.8.3.1




Crash due to destroying TCP request sockets using SOCK_DESTROY

2018-07-05 Thread Subash Abhinov Kasiviswanathan

We are seeing a crash on an ARM64 device with Android 4.14 based kernel.

From the call stack, a TCP socket is being destroyed using netlink_diag.
The memory dump showed that the socket was an inet request socket (in
state TCP_NEW_SYN_RECV) with refcount of 0.

The crash seems to have happened during a regression test where wifi
was toggled with some browser activity but it is not very easily
reproducible. I believe netd on Android tries to destroy all sockets in
a system on change of network.

 13232.479820:   <2> refcount_t: underflow; use-after-free.
 13232.479838:   <6> [ cut here ]
 13232.479843:   <6> kernel BUG at kernel/msm-4.14/lib/refcount.c:204!
 13232.479849:   <6> Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
 13232.479895:   <6> CPU: 4 PID: 648 Comm: netd Tainted: G S  W  O   
 4.14.49+ #1

 13232.479897:   <6> task: fff5d6e28080 task.stack: ff801cf98000
 13232.479908:   <2> pc : refcount_sub_and_test+0x64/0x78
 13232.479910:   <2> lr : refcount_sub_and_test+0x64/0x78
 13232.479911:   <2> sp : ff801cf9ba40 pstate : 20400145
 13232.479911:   <2> x29: ff801cf9ba40 x28: fff5d6e28080
 13232.479914:   <2> x27: ff801cf9bd10 x26: fff4a1428f40
 13232.479915:   <2> x25:  x24: ff91
 13232.479917:   <2> x23: 0015 x22: fff5b837c880
 13232.479919:   <2> x21: fff4a1428f40 x20: 
 13232.479920:   <2> x19: fff4c47c6088 x18: e7b13cd1ecbfea00
 13232.479922:   <2> x17: 0008ec3bb553 x16: 011d8776aa792786
 13232.479924:   <2> x15: e7b13cd1ecbfea00 x14: 2bdb7692
 13232.479925:   <2> x13:  x12: e7b13cd1ecbfea00
 13232.479927:   <2> x11: e7b13cd1ecbfea00 x10: 
 13232.479928:   <2> x9 : e7b13cd1ecbfea00 x8 : 
 13232.479929:   <2> x7 : 0001 x6 : 0001
 13232.479931:   <2> x5 :  x4 : 0c08ed425d69
 13232.479932:   <2> x3 : 0066effb6000 x2 : ff8f09dc5000
 13232.479934:   <2> x1 :  x0 : 0026
 13232.479996:   <6> Process netd (pid: 648, stack limit = 
0xff801cf98000)

 13232.479998:   <2> Call trace:
 13232.48:   <2>  refcount_sub_and_test+0x64/0x78
 13232.480002:   <2>  refcount_dec_and_test+0x18/0x24
 13232.480005:   <2>  sock_gen_put+0x1c/0xb0
 13232.480009:   <2>  tcp_diag_destroy+0x54/0x68
 13232.480010:   <2>  inet_diag_cmd_exact+0x78/0xa0
 13232.480012:   <2>  inet_diag_handler_cmd+0xcc/0xf8
 13232.480018:   <2>  sock_diag_rcv_msg+0x130/0x158
 13232.480021:   <2>  netlink_rcv_skb+0xa4/0x11c
 13232.480023:   <2>  sock_diag_rcv+0x34/0x48
 13232.480025:   <2>  netlink_unicast+0x158/0x1f0
 13232.480026:   <2>  netlink_sendmsg+0x334/0x340
 13232.480028:   <2>  sock_sendmsg+0x44/0x60
 13232.480031:   <2>  sock_write_iter+0xac/0xf4
 13232.480034:   <2>  __vfs_write+0x124/0x154
 13232.480036:   <2>  vfs_write+0xcc/0x188
 13232.480038:   <2>  SyS_write+0x60/0xc0
 13232.480040:   <2>  el0_svc_naked+0x34/0x38
 13232.480042:   <6> Code: 910003fd f0008200 910fd000 97f4158c 
(d421)

 13232.480045:   <6> ---[ end trace 994bad5b8077e394 ]---
 13232.480061:   <6> Kernel panic - not syncing: Fatal exception

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project


Re: [PATCH net-next 0/2] IP listification follow-ups

2018-07-05 Thread David Miller
From: Edward Cree 
Date: Thu, 5 Jul 2018 15:45:04 +0100

> While working on IPv6 list processing, I found another bug in the IPv4
>  version.  So this patch series has that fix, and the IPv6 version with
>  both fixes incorporated.

Series applied.

Edward, please put (" ") around the commit header line text in your
Fixes: tags in the future.  I fixed it up for you this time.

Thank you.


Re: [PATCH net] net: aquantia: vlan unicast address list correct handling

2018-07-05 Thread David Miller
From: Igor Russkikh 
Date: Thu,  5 Jul 2018 17:01:09 +0300

> Setting up macvlan/macvtap networks over atlantic NIC results
> in no traffic over these networks because ndo_set_rx_mode did
> not list UC MACs as registered in unicast filter.
> 
> Here we fix that taking into account maximum number of UC
> filters supported by hardware. If more than MAX addresses were
> registered, we just enable promisc  and/or allmulti to pass
> the traffic in.
> 
> We also remove MULTICAST_ADDRESS_MAX constant from aq_cfg since
> thats not a configurable parameter at all.
> 
> Fixes: b21f502 ("net:ethernet:aquantia: Fix for multicast filter handling.")
> Signed-off-by: Igor Russkikh 

Applied and queued up for -stable.

Thanks!


Re: [PATCH net] MAINTAINERS: update my email address

2018-07-05 Thread David Miller
From: Stefan Schmidt 
Date: Thu,  5 Jul 2018 13:56:44 +0200

> The mail server hosting the old address is going to fade out.
> Time to update to an address I control directly.
> 
> Signed-off-by: Stefan Schmidt 

Applied, thank you.


[PATCH iproute2-next] tc: m_tunnel_key: Add tunnel option support to act_tunnel_key

2018-07-05 Thread Jakub Kicinski
From: Simon Horman 

Allow setting tunnel options using the act_tunnel_key action.

Options are expressed as class:type:data and multiple options
may be listed using a comma delimiter.

 # ip link add name geneve0 type geneve dstport 0 external
 # tc qdisc add dev eth0 ingress
 # tc filter add dev eth0 protocol ip parent ffff: \
 flower indev eth0 \
ip_proto udp \
action tunnel_key \
set src_ip 10.0.99.192 \
dst_ip 10.0.99.193 \
dst_port 6081 \
id 11 \
geneve_opts 0102:80:00800022,0102:80:00800022 \
action mirred egress redirect dev geneve0

Signed-off-by: Simon Horman 
Signed-off-by: Pieter Jansen van Vuuren 
Reviewed-by: Jakub Kicinski 
---
 man/man8/tc-tunnel_key.8 |  12 ++-
 tc/m_tunnel_key.c| 177 +++
 2 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
index e979a74715cb..7d4b30e41faf 100644
--- a/man/man8/tc-tunnel_key.8
+++ b/man/man8/tc-tunnel_key.8
@@ -64,7 +64,9 @@ and
 .B dst_ip
 options.
 .B dst_port
-is optional.
+and
+.B geneve_opts
+are optional.
 .RS
 .TP
 .B id
@@ -79,6 +81,14 @@ Outer header destination IP address (IPv4 or IPv6)
 .B dst_port
 Outer header destination UDP port
 .TP
+.B geneve_opts
+Geneve variable length options.
+.B geneve_opts
+is specified in the form CLASS:TYPE:DATA, where CLASS is represented as a
+16bit hexadecimal value, TYPE as an 8bit hexadecimal value and DATA as a
+variable length hexadecimal value. Additionally multiple options may be
+listed using a comma delimiter.
+.TP
 .RB [ no ] csum
 Controlls outer UDP checksum. When set to
 .B csum
diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c
index 0fa461549ad9..5a0e3fc3c48f 100644
--- a/tc/m_tunnel_key.c
+++ b/tc/m_tunnel_key.c
@@ -29,6 +29,7 @@ static void explain(void)
"src_ip  (mandatory)\n"
"dst_ip  (mandatory)\n"
"dst_port \n"
+   "geneve_opts \n"
"csum | nocsum (default is \"csum\")\n");
 }
 
@@ -81,6 +82,114 @@ static int tunnel_key_parse_dst_port(char *str, int type, 
struct nlmsghdr *n)
return 0;
 }
 
+static int tunnel_key_parse_be16(char *str, int base, int type,
+struct nlmsghdr *n)
+{
+   int ret;
+   __be16 value;
+
+   ret = get_be16(&value, str, base);
+   if (ret)
+   return ret;
+
+   addattr16(n, MAX_MSG, type, value);
+
+   return 0;
+}
+
+static int tunnel_key_parse_u8(char *str, int base, int type,
+  struct nlmsghdr *n)
+{
+   int ret;
+   __u8 value;
+
+   ret = get_u8(&value, str, base);
+   if (ret)
+   return ret;
+
+   addattr8(n, MAX_MSG, type, value);
+
+   return 0;
+}
+
+static int tunnel_key_parse_geneve_opt(char *str, struct nlmsghdr *n)
+{
+   char *token, *saveptr = NULL;
+   struct rtattr *nest;
+   int i, ret;
+
+   nest = addattr_nest(n, MAX_MSG, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+
+   token = strtok_r(str, ":", &saveptr);
+   i = 1;
+   while (token) {
+   switch (i) {
+   case TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS:
+   {
+   ret = tunnel_key_parse_be16(token, 16, i, n);
+   if (ret)
+   return ret;
+   break;
+   }
+   case TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE:
+   {
+   ret = tunnel_key_parse_u8(token, 16, i, n);
+   if (ret)
+   return ret;
+   break;
+   }
+   case TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA:
+   {
+   size_t token_len = strlen(token);
+   uint8_t *opts;
+
+   opts = malloc(token_len / 2);
+   if (!opts)
+   return -1;
+   if (hex2mem(token, opts, token_len / 2) < 0) {
+   free(opts);
+   return -1;
+   }
+   addattr_l(n, MAX_MSG, i, opts, token_len / 2);
+   free(opts);
+
+   break;
+   }
+   default:
+   return -1;
+   }
+
+   token = strtok_r(NULL, ":", &saveptr);
+   i++;
+   }
+
+   addattr_nest_end(n, nest);
+
+   return 0;
+}
+
+static int tunnel_key_parse_geneve_opts(char *str, struct nlmsghdr *n)
+{
+   char *token, *saveptr = NULL;
+   struct rtattr *nest;
+   int ret;
+
+   nest = addattr_nest(n, MAX_MSG, TCA_TUNNEL_KEY_ENC_OPTS);
+
+   token = strtok_r(str, ",", &saveptr);
+   while (token) {
+   ret = tunnel_key_parse_geneve_opt(token, n);
+   if (ret)
+   return 

Re: [PATCH bpf-next 11/11] tools: bpftool: allow reuse of maps with bpftool prog load

2018-07-05 Thread Jakub Kicinski
On Thu, 5 Jul 2018 10:35:24 +0200, Daniel Borkmann wrote:
> On 07/04/2018 04:54 AM, Jakub Kicinski wrote:
> > Add map parameter to prog load which will allow reuse of existing
> > maps instead of creating new ones.
> > 
> > Signed-off-by: Jakub Kicinski 
> > Reviewed-by: Quentin Monnet   
> [...]
> > +
> > +   fd = map_parse_fd(, );
> > +   if (fd < 0)
> > +   goto err_free_reuse_maps;
> > +
> > +   map_replace = reallocarray(map_replace, old_map_fds + 1,
> > +  sizeof(*map_replace));
> > +   if (!map_replace) {
> > +   p_err("mem alloc failed");
> > +   goto err_free_reuse_maps;  
> 
> Series in general looks good to me. However, above reallocarray() doesn't
> exist and hence build fails, please see below. Is that from newest glibc?
> 
> You probably need some fallback implementation or in general have something
> bpftool internal that doesn't make a bet on its availability.
> 
> # make
> 
> Auto-detecting system features:
> ...libbfd: [ on  ]
> ...disassembler-four-args: [ OFF ]
> 
>   CC   bpf_jit_disasm.o
>   LINK bpf_jit_disasm
>   CC   bpf_dbg.o
>   LINK bpf_dbg
>   CC   bpf_asm.o
>   BISONbpf_exp.yacc.c
>   CC   bpf_exp.yacc.o
>   FLEX bpf_exp.lex.c
>   CC   bpf_exp.lex.o
>   LINK bpf_asm
>   DESCEND  bpftool
> 
> Auto-detecting system features:
> ...libbfd: [ on  ]
> ...disassembler-four-args: [ OFF ]
> 
>   CC   map_perf_ring.o
>   CC   xlated_dumper.o
>   CC   perf.o
>   CC   prog.o
> prog.c: In function ‘do_load’:
> prog.c:785:18: warning: implicit declaration of function ‘reallocarray’ 
> [-Wimplicit-function-declaration]
> map_replace = reallocarray(map_replace, old_map_fds + 1,
>   ^~~~
> prog.c:785:16: warning: assignment makes pointer from integer without a cast 
> [-Wint-conversion]
> map_replace = reallocarray(map_replace, old_map_fds + 1,
> ^
>   CC   common.o
>   CC   cgroup.o
>   CC   main.o
>   CC   json_writer.o
>   CC   cfg.o
>   CC   map.o
>   CC   jit_disasm.o
>   CC   disasm.o
> 
> Auto-detecting system features:
> ...libelf: [ on  ]
> ...   bpf: [ on  ]
> 
> Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from 
> latest version at 'include/uapi/linux/bpf.h'
>   CC   libbpf.o
>   CC   bpf.o
>   CC   nlattr.o
>   CC   btf.o
>   LD   libbpf-in.o
>   LINK libbpf.a
>   LINK bpftool
> prog.o: In function `do_load':
> prog.c:(.text+0x23d): undefined reference to `reallocarray'
> collect2: error: ld returned 1 exit status
> Makefile:89: recipe for target 'bpftool' failed
> make[1]: *** [bpftool] Error 1
> Makefile:99: recipe for target 'bpftool' failed
> make: *** [bpftool] Error 2

Oh no..  Sorry & thanks for catching this.  It would be nice to not
depend on Glibc version defines, in case someone is using a different
library.  Jiong suggested we can just use the feature detection, so I
have something like this:

---

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 0911b00b25cc..20a691659381 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -52,8 +52,8 @@ INSTALL ?= install
 RM ?= rm -f
 
 FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args
-FEATURE_DISPLAY = libbfd disassembler-four-args
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray
+FEATURE_DISPLAY = libbfd disassembler-four-args reallocarray
 
 check_feat := 1
 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install 
doc-uninstall
diff --git a/tools/bpf/bpftool/compat.h b/tools/bpf/bpftool/compat.h
new file mode 100644
index ..7885cedc9efe
--- /dev/null
+++ b/tools/bpf/bpftool/compat.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+
+#ifndef __BPF_TOOL_COMPAT_H
+#define __BPF_TOOL_COMPAT_H
+
+#define _GNU_SOURCE
+#include 
+
+static inline void *reallocarray(void *ptr, size_t nmemb, size_t size)
+{
+   return realloc(ptr, nmemb * size);
+}
+#endif
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 1a9a2aefa014..2106adb73631 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -43,6 +43,7 @@
 #include 
 #include 
 
+#include "compat.h"
 #include "json_writer.h"
 
 #define ptr_to_u64(ptr)((__u64)(unsigned long)(ptr))
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index dac9563b5470..0516259be70f 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -14,6 +14,7 @@ FILES=  \
  test-libaudit.bin  \
  test-libbfd.bin

[PATCH v3 iproute2 0/3] Add support for ETF qdisc

2018-07-05 Thread Jesus Sanchez-Palencia
Changes since v2:
 - Added man page for tc-etf.

The ETF (earliest txtime first) qdisc was recently merged into net-next
[1], so this patchset adds support for it through the tc command line
tool.

An initial man page is also provided.

The first commit in this series is adding an updated version of
include/uapi/linux/pkt_sched.h and is not meant to be merged. It's
provided here just as a convenience for those who want to easily build
this patchset.

[1] https://patchwork.ozlabs.org/cover/938991/

Jesus Sanchez-Palencia (2):
  uapi pkt_sched: Add etf info - DO NOT COMMIT
  man: Add initial manpage for tc-etf(8)

Vinicius Costa Gomes (1):
  tc: Add support for the ETF Qdisc

 include/uapi/linux/pkt_sched.h |  21 +
 man/man8/tc-etf.8  | 141 +++
 tc/Makefile|   1 +
 tc/q_etf.c | 168 +
 4 files changed, 331 insertions(+)
 create mode 100644 man/man8/tc-etf.8
 create mode 100644 tc/q_etf.c

-- 
2.18.0



[PATCH v3 iproute2 3/3] man: Add initial manpage for tc-etf(8)

2018-07-05 Thread Jesus Sanchez-Palencia
Add an initial manpage for tc-etf covering all config options, basic
concepts and operation modes.

Signed-off-by: Jesus Sanchez-Palencia 
---
 man/man8/tc-etf.8 | 141 ++
 1 file changed, 141 insertions(+)
 create mode 100644 man/man8/tc-etf.8

diff --git a/man/man8/tc-etf.8 b/man/man8/tc-etf.8
new file mode 100644
index ..30a12de7
--- /dev/null
+++ b/man/man8/tc-etf.8
@@ -0,0 +1,141 @@
+.TH ETF 8 "05 Jul 2018" "iproute2" "Linux"
+.SH NAME
+ETF \- Earliest TxTime First (ETF) Qdisc
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B parent
+classid
+.B [ handle
+major:
+.B ] etf clockid
+clockid
+.B [ delta
+delta_nsecs
+.B ] [ deadline_mode ]
+.B [ offload ]
+
+.SH DESCRIPTION
+The ETF (Earliest TxTime First) qdisc allows applications to control
+the instant when a packet should be dequeued from the traffic control
+layer into the netdevice. If
+.B offload
+is configured and supported by the network interface card, then it will
+also control when packets leave the network controller.
+
+ETF achieves that by buffering packets until a configurable time
+before their transmission time (i.e. txtime, or deadline), which can
+be configured through the
+.B delta
+option.
+
+The qdisc uses a rb-tree internally so packets are always 'ordered' by
+their txtime and will be dequeued following the (next) earliest txtime
+first.
+
+It relies on the SO_TXTIME socket option and the SCM_TXTIME CMSG in
+each packet field to configure the behavior of time dependent sockets:
+the clockid to be used as a reference, if the expected mode of txtime
+for that socket is deadline or strict mode, and if packet drops should
+be reported on the socket's error queue. See
+.BR socket(7)
+for more information.
+
+The etf qdisc will drop any packets with a txtime in the past, or if a
+packet expires while waiting to be dequeued.
+
+This queueing discipline is intended to be used by TSN (Time Sensitive
+Networking) applications, and it exposes a traffic shaping functionality
+that is commonly documented as "Launch Time" or "Time-Based Scheduling"
+by vendors and the documentation of network interface controllers.
+
+ETF is meant to be installed under another qdisc that maps packet flows
+to traffic classes, one example is
+.BR mqprio(8).
+
+.SH PARAMETERS
+.TP
+clockid
+.br
+Specifies the clock to be used by qdisc's internal timer for measuring
+time and scheduling events. The qdisc expects packets passing
+through it to be using this same
+.B clockid
+as the reference of their txtime timestamps. It will drop packets
+coming from sockets that do not comply with that.
+
+For more information about time and clocks on Linux, please refer
+to
+.BR time(7)
+and
+.BR clock_gettime(3).
+
+.TP
+delta
+.br
+After enqueueing or dequeueing a packet, the qdisc will schedule its
+next wake-up time for the next txtime minus this delta value.
+This means
+.B delta
+can be used as a fudge factor for the scheduler latency of a system.
+This value must be specified in nanoseconds.
+The default value is 0 nanoseconds.
+
+.TP
+deadline_mode
+.br
+When
+.B deadline_mode
+is set, the qdisc will handle txtime with different semantics,
+changed from a 'strict' transmission time to a deadline.
+In practice, this means during the dequeue flow
+.BR etf(8)
+will set the txtime of the packet being dequeued to 'now'.
+The default is for this option to be disabled.
+
+.TP
+offload
+.br
+When
+.B offload
+is set,
+.BR etf(8)
+will try to configure the network interface so time-based transmission
+arbitration is enabled in the controller. This feature is commonly
+referred to as "Launch Time" or "Time-Based Scheduling" by the
+documentation of network interface controllers.
+The default is for this option to be disabled.
+
+.SH EXAMPLES
+
+ETF is used to enforce a Quality of Service. It controls when each
+packet should be dequeued and transmitted, and can be used for
+limiting the data rate of a traffic class. To separate packets into
+traffic classes the user may choose
+.BR mqprio(8),
+and configure it like this:
+
+.EX
+# tc qdisc add dev eth0 handle 100: parent root mqprio num_tc 3 \\
+   map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\
+   queues 1@0 1@1 2@2 \\
+   hw 0
+.EE
+.P
+To replace the current queueing discipline by ETF in traffic class
+number 0, issue:
+.P
+.EX
+# tc qdisc replace dev eth0 parent 100:1 etf \\
+   clockid CLOCK_TAI delta 300000 offload
+.EE
+
+With the options above, etf will be configured to use CLOCK_TAI as
+its clockid_t, will schedule packets for 300 us before their txtime,
+and will enable the offload functionality in the network interface
+card. Deadline mode will not be configured for this qdisc.
+
+.SH AUTHORS
+Jesus Sanchez-Palencia 
+.br
+Vinicius Costa Gomes 
-- 
2.18.0



[PATCH v3 iproute2 2/3] tc: Add support for the ETF Qdisc

2018-07-05 Thread Jesus Sanchez-Palencia
From: Vinicius Costa Gomes 

The "Earliest TxTime First" (ETF) queueing discipline allows precise
control of the transmission time of packets by providing a sorted
time-based scheduling of packets.

The syntax is:

tc qdisc add dev DEV parent NODE etf delta NANOS
 clockid CLOCKID [offload] [deadline_mode]

Signed-off-by: Vinicius Costa Gomes 
Signed-off-by: Jesus Sanchez-Palencia 
---
 tc/Makefile |   1 +
 tc/q_etf.c  | 168 
 2 files changed, 169 insertions(+)
 create mode 100644 tc/q_etf.c

diff --git a/tc/Makefile b/tc/Makefile
index dfd00267..4525c0fb 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -71,6 +71,7 @@ TCMODULES += q_clsact.o
 TCMODULES += e_bpf.o
 TCMODULES += f_matchall.o
 TCMODULES += q_cbs.o
+TCMODULES += q_etf.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_etf.c b/tc/q_etf.c
new file mode 100644
index ..5db1dd6f
--- /dev/null
+++ b/tc/q_etf.c
@@ -0,0 +1,168 @@
+/*
+ * q_etf.c Earliest TxTime First (ETF).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Vinicius Costa Gomes 
+ * Jesus Sanchez-Palencia 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+#define CLOCKID_INVALID (-1)
+static void explain(void)
+{
+   fprintf(stderr, "Usage: ... etf delta NANOS clockid CLOCKID [offload] 
[deadline_mode]\n");
+   fprintf(stderr, "CLOCKID must be a valid SYS-V id (i.e. CLOCK_TAI)\n");
+}
+
+static void explain1(const char *arg, const char *val)
+{
+   fprintf(stderr, "etf: illegal value for \"%s\": \"%s\"\n", arg, val);
+}
+
+static void explain_clockid(const char *val)
+{
+   fprintf(stderr, "etf: illegal value for \"clockid\": \"%s\".\n", val);
+   fprintf(stderr, "It must be a valid SYS-V id (i.e. CLOCK_TAI)");
+}
+
+static int get_clockid(__s32 *val, const char *arg)
+{
+   const struct static_clockid {
+   const char *name;
+   clockid_t clockid;
+   } clockids_sysv[] = {
+   { "CLOCK_REALTIME", CLOCK_REALTIME },
+   { "CLOCK_TAI", CLOCK_TAI },
+   { "CLOCK_BOOTTIME", CLOCK_BOOTTIME },
+   { "CLOCK_MONOTONIC", CLOCK_MONOTONIC },
+   { NULL }
+   };
+
+   const struct static_clockid *c;
+
+   for (c = clockids_sysv; c->name; c++) {
+   if (strncasecmp(c->name, arg, 25) == 0) {
+   *val = c->clockid;
+
+   return 0;
+   }
+   }
+
+   return -1;
+}
+
+
+static int etf_parse_opt(struct qdisc_util *qu, int argc,
+char **argv, struct nlmsghdr *n, const char *dev)
+{
+   struct tc_etf_qopt opt = {
+   .clockid = CLOCKID_INVALID,
+   };
+   struct rtattr *tail;
+
+   while (argc > 0) {
+   if (matches(*argv, "offload") == 0) {
+   if (opt.flags & TC_ETF_OFFLOAD_ON) {
+   fprintf(stderr, "etf: duplicate \"offload\" 
specification\n");
+   return -1;
+   }
+
+   opt.flags |= TC_ETF_OFFLOAD_ON;
+   } else if (matches(*argv, "deadline_mode") == 0) {
+   if (opt.flags & TC_ETF_DEADLINE_MODE_ON) {
+   fprintf(stderr, "etf: duplicate 
\"deadline_mode\" specification\n");
+   return -1;
+   }
+
+   opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+   } else if (matches(*argv, "delta") == 0) {
+   NEXT_ARG();
+   if (opt.delta) {
+   fprintf(stderr, "etf: duplicate \"delta\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(, *argv, 0)) {
+   explain1("delta", *argv);
+   return -1;
+   }
+   } else if (matches(*argv, "clockid") == 0) {
+   NEXT_ARG();
+   if (opt.clockid != CLOCKID_INVALID) {
+   fprintf(stderr, "etf: duplicate \"clockid\" 
specification\n");
+   return -1;
+   }
+   if (get_clockid(, *argv)) {
+   explain_clockid(*argv);
+   return -1;
+   }
+   } else if (strcmp(*argv, "help") == 0) {
+   explain();
+   return -1;
+  

[PATCH v3 iproute2 1/3] uapi pkt_sched: Add etf info - DO NOT COMMIT

2018-07-05 Thread Jesus Sanchez-Palencia
This should come from the next uapi headers update.
Sending it now just as a convenience so anyone can build tc with etf
and taprio support.

Signed-off-by: Jesus Sanchez-Palencia 
---
 include/uapi/linux/pkt_sched.h | 21 +
 1 file changed, 21 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096a..94911846 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -539,6 +539,7 @@ enum {
TCA_NETEM_LATENCY64,
TCA_NETEM_JITTER64,
TCA_NETEM_SLOT,
+   TCA_NETEM_SLOT_DIST,
__TCA_NETEM_MAX,
 };
 
@@ -581,6 +582,8 @@ struct tc_netem_slot {
__s64   max_delay;
__s32   max_packets;
__s32   max_bytes;
+   __s64   dist_delay; /* nsec */
+   __s64   dist_jitter; /* nsec */
 };
 
 enum {
@@ -934,4 +937,22 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+   __s32 delta;
+   __s32 clockid;
+   __u32 flags;
+#define TC_ETF_DEADLINE_MODE_ONBIT(0)
+#define TC_ETF_OFFLOAD_ON  BIT(1)
+};
+
+enum {
+   TCA_ETF_UNSPEC,
+   TCA_ETF_PARMS,
+   __TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
 #endif
-- 
2.18.0



Business Proposal

2018-07-05 Thread BRENDA WILSON



I am Sgt.Brenda Wilson, originally from Lake Jackson Texas USA.I personally 
made a special research and I came across your information. I am presently 
writing this mail to you from U.S Military base Kabul Afghanistan I have a 
secured business proposal for you. Reply for more details via my private E-mail 
( brendawilson...@hotmail.com )


Business Proposal

2018-07-05 Thread BRENDA WILSON



I am Sgt.Brenda Wilson, originally from Lake Jackson Texas USA.I personally 
made a special research and I came across your information. I am presently 
writing this mail to you from U.S Military base Kabul Afghanistan I have a 
secured business proposal for you. Reply for more details via my private E-mail 
( brendawilson...@hotmail.com )


RE: [PATCH v1 net-next 6/9] lan743x: Add power management support

2018-07-05 Thread Bryan.Whitehead
> > +   data = lan743x_csr_read(adapter, PMT_CTL);
> 
> Hi Bryan
> 
> Why do you do this read? You do not use the result.
> 
Good catch, I'll remove it.

> > +
> > +   wol->supported = WAKE_BCAST | WAKE_UCAST | WAKE_MCAST |
> > +   WAKE_MAGIC | WAKE_PHY | WAKE_ARP;
> > +
> > +   wol->wolopts = adapter->wolopts;
> > +}
> > +#endif /* CONFIG_PM */
> > +
> > +static int lan743x_pm_wakeframe_crc16(const u8 *buf, int len) {
> > +   const u16 crc16poly = 0x8005;
> > +   u16 bit, crc, msb;
> > +   u8 data;
> > +   int i;
> > +
> > +   crc = 0x;
> > +   for (i = 0; i < len; i++) {
> > +   data = *buf++;
> > +   for (bit = 0; bit < 8; bit++) {
> > +   msb = crc >> 15;
> > +   crc <<= 1;
> > +
> > +   if (msb ^ (u16)(data & 1)) {
> > +   crc ^= crc16poly;
> > +   crc |= (u16)0x0001U;
> > +   }
> > +   data >>= 1;
> > +   }
> > +   }
> > +
> 
> There are a few different crc algorithms in lib. Can you use one of them,
> rather than implementing it yourself?

OK I'll check.

> 
> > +#if CONFIG_PM
> > +static int lan743x_pm_suspend(struct device *dev) {
> > +   struct pci_dev *pdev = to_pci_dev(dev);
> > +   struct net_device *netdev = pci_get_drvdata(pdev);
> > +   struct lan743x_adapter *adapter = netdev_priv(netdev);
> > +   u16 phydata;
> > +   int ret;
> > +
> > +   if (adapter->wolopts & WAKE_PHY) {
> > +   phydata = phy_read(netdev->phydev, 27);
> > +   phydata |= 0x0500;
> > +   phy_write(netdev->phydev, 27, phydata);
> > +   }
> 
> Shouldn't the PHY driver do this?

Perhaps so. I'll check with the PM writer.

Thanks Andrew


RE: [PATCH v1 net-next 5/9] lan743x: Add support for ethtool eeprom access

2018-07-05 Thread Bryan.Whitehead
> 
> MAX_EEPROM_SIZE ?
> 
... snip ...
> 
>   Andrew

Thanks Andrew, I'll change it.


[PATCH net-next 1/2] selftests: forwarding: Allow importing dependent libraries

2018-07-05 Thread Petr Machata
The next patch introduces a new mlxsw-specific test that uses
mirror_gre_lib.sh and mirror_gre_topo_lib.sh.

However when sourcing their own deps, these libraries assume that the
test that's running is in the same directory. That's not the case for
driver-specific tests.

So change the libraries to source their deps through $relative_path.
That variable is set up by lib.sh, which should be imported by the test
in question in any case.

Signed-off-by: Petr Machata 
---
 tools/testing/selftests/net/forwarding/mirror_gre_lib.sh  | 2 +-
 tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh 
b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
index 1c18e332cd4f..fac486178ef7 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-source mirror_lib.sh
+source "$relative_path/mirror_lib.sh"
 
 quick_test_span_gre_dir_ips()
 {
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh 
b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
index 253419564708..39c03e2867f4 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
@@ -33,7 +33,7 @@
 #   | |
 #   +-+
 
-source mirror_topo_lib.sh
+source "$relative_path/mirror_topo_lib.sh"
 
 mirror_gre_topo_h3_create()
 {
-- 
2.4.11



[PATCH net-next 2/2] selftests: mlxsw: Add mlxsw-specific test for mirror to gretap

2018-07-05 Thread Petr Machata
Test several aspects of offloading mirror to gretap and ip6gretap
netdevices that are specific to mlxsw, such as requirements for TTL and
TOS values.

Signed-off-by: Petr Machata 
Reviewed-by: Jiri Pirko 
---
 .../selftests/drivers/net/mlxsw/mirror_gre.sh  | 217 +
 1 file changed, 217 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh 
b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh
new file mode 100755
index ..76f1ab4898d9
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh
@@ -0,0 +1,217 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# ../../../net/forwarding/mirror_gre_topo_lib.sh for more details.
+#
+# Test various aspects of offloading gretap mirrors that are specific to
+# mlxsw.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=6
+source $lib_dir/lib.sh
+source $lib_dir/mirror_lib.sh
+source $lib_dir/mirror_gre_lib.sh
+source $lib_dir/mirror_gre_topo_lib.sh
+
+setup_keyful()
+{
+   tunnel_create gt6-key ip6gretap 2001:db8:3::1 2001:db8:3::2 \
+ ttl 100 tos inherit allow-localremote \
+ key 1234
+
+   tunnel_create h3-gt6-key ip6gretap 2001:db8:3::2 2001:db8:3::1 \
+ key 1234
+   ip link set h3-gt6-key vrf v$h3
+   matchall_sink_create h3-gt6-key
+
+   ip address add dev $swp3 2001:db8:3::1/64
+   ip address add dev $h3 2001:db8:3::2/64
+}
+
+cleanup_keyful()
+{
+   ip address del dev $h3 2001:db8:3::2/64
+   ip address del dev $swp3 2001:db8:3::1/64
+
+   tunnel_destroy h3-gt6-key
+   tunnel_destroy gt6-key
+}
+
+setup_soft()
+{
+   # Set up a topology for testing underlay routes that point at an
+   # unsupported soft device.
+
+   tunnel_create gt6-soft ip6gretap 2001:db8:4::1 2001:db8:4::2 \
+ ttl 100 tos inherit allow-localremote
+
+   tunnel_create h3-gt6-soft ip6gretap 2001:db8:4::2 2001:db8:4::1
+   ip link set h3-gt6-soft vrf v$h3
+   matchall_sink_create h3-gt6-soft
+
+   ip link add name v1 type veth peer name v2
+   ip link set dev v1 up
+   ip address add dev v1 2001:db8:4::1/64
+
+   ip link set dev v2 vrf v$h3
+   ip link set dev v2 up
+   ip address add dev v2 2001:db8:4::2/64
+}
+
+cleanup_soft()
+{
+   ip link del dev v1
+
+   tunnel_destroy h3-gt6-soft
+   tunnel_destroy gt6-soft
+}
+
+setup_prepare()
+{
+   h1=${NETIFS[p1]}
+   swp1=${NETIFS[p2]}
+
+   swp2=${NETIFS[p3]}
+   h2=${NETIFS[p4]}
+
+   swp3=${NETIFS[p5]}
+   h3=${NETIFS[p6]}
+
+   vrf_prepare
+   mirror_gre_topo_create
+
+   ip address add dev $swp3 2001:db8:2::1/64
+   ip address add dev $h3 2001:db8:2::2/64
+
+   ip address add dev $swp3 192.0.2.129/28
+   ip address add dev $h3 192.0.2.130/28
+
+   setup_keyful
+   setup_soft
+}
+
+cleanup()
+{
+   pre_cleanup
+
+   cleanup_soft
+   cleanup_keyful
+
+   ip address del dev $h3 2001:db8:2::2/64
+   ip address del dev $swp3 2001:db8:2::1/64
+
+   ip address del dev $h3 192.0.2.130/28
+   ip address del dev $swp3 192.0.2.129/28
+
+   mirror_gre_topo_destroy
+   vrf_cleanup
+}
+
+test_span_gre_ttl_inherit()
+{
+   local tundev=$1; shift
+   local type=$1; shift
+   local what=$1; shift
+
+   RET=0
+
+   ip link set dev $tundev type $type ttl inherit
+   mirror_install $swp1 ingress $tundev "matchall $tcflags"
+   fail_test_span_gre_dir $tundev ingress
+
+   ip link set dev $tundev type $type ttl 100
+
+   quick_test_span_gre_dir $tundev ingress
+   mirror_uninstall $swp1 ingress
+
+   log_test "$what: no offload on TTL of inherit ($tcflags)"
+}
+
+test_span_gre_tos_fixed()
+{
+   local tundev=$1; shift
+   local type=$1; shift
+   local what=$1; shift
+
+   RET=0
+
+   ip link set dev $tundev type $type tos 0x10
+   mirror_install $swp1 ingress $tundev "matchall $tcflags"
+   fail_test_span_gre_dir $tundev ingress
+
+   ip link set dev $tundev type $type tos inherit
+   quick_test_span_gre_dir $tundev ingress
+   mirror_uninstall $swp1 ingress
+
+   log_test "$what: no offload on a fixed TOS ($tcflags)"
+}
+
+test_span_failable()
+{
+   local should_fail=$1; shift
+   local tundev=$1; shift
+   local what=$1; shift
+
+   RET=0
+
+   mirror_install $swp1 ingress $tundev "matchall $tcflags"
+   if ((should_fail)); then
+   fail_test_span_gre_dir  $tundev ingress
+   else
+   quick_test_span_gre_dir $tundev ingress
+   fi
+   mirror_uninstall $swp1 ingress
+
+   log_test "$what: should_fail=$should_fail ($tcflags)"
+}
+
+test_failable()
+{
+   local should_fail=$1; shift
+
+   test_span_failable 

[PATCH net-next 0/2] Add a mlxsw-specific test for mirror-to-gretap

2018-07-05 Thread Petr Machata
Some configurations of mirror-to-gretap are impossible for mlxsw to
offload. Add a test that checks that these out-of-domain conditions are
handled properly by mlxsw.

In patch #1, fix mirror_gre_lib.sh and mirror_gre_topo_lib.sh so that
they can be imported from directories other than forwarding/.

In patch #2, add a test to check handling of several scenarios that
mlxsw is expected to fail to offload.

Petr Machata (2):
  selftests: forwarding: Allow importing dependent libraries
  selftests: mlxsw: Add mlxsw-specific test for mirror to gretap

 .../selftests/drivers/net/mlxsw/mirror_gre.sh  | 217 +
 .../selftests/net/forwarding/mirror_gre_lib.sh |   2 +-
 .../net/forwarding/mirror_gre_topo_lib.sh  |   2 +-
 3 files changed, 219 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh

-- 
2.4.11



RE: [PATCH v1 net-next 3/9] lan743x: Add support for ethtool statistics

2018-07-05 Thread Bryan.Whitehead
> ARRAY_SIZE(lan743x_set0_hw_cnt_addr) ?
> 
...snip...
> 
>   Andrew

Will do, I will resubmit with these changes.



RE: [PATCH v1 net-next 1/9] lan743x: Add support for ethtool get_drvinfo

2018-07-05 Thread Bryan.Whitehead
> Hi Bryan
> 
> It is normal to put something in the commit message, even if it is the Subject
> line said in a different way.
> 
> Otherwise, this looks O.K.
> 
>   Andrew

OK, thanks Andrew


Re: [PATCH bpf-next 7/7] nfp: bpf: migrate to advanced reciprocal divide in reciprocal_div.h

2018-07-05 Thread Jiong Wang

On 26/06/2018 21:59, Jakub Kicinski wrote:

On Sun, 24 Jun 2018 20:54:21 -0700, Jakub Kicinski wrote:

+* NOTE: because we are using "reciprocal_value_adv" which doesn't
+* support dividend with MSB set, so we need to JIT separate NFP
+* sequence to handle such case. It could be a simple sequence if there
+* is conditional move, however there isn't for NFP. So, we don't bother
+* generating compare-if-set-branch sequence by rejecting the program
+* straight away when the u32 dividend has MSB set. Divide by such a
+* large constant would be rare in practice. Also, the programmer could
+* simply rewrite it as "result = divisor >= the_const".

Thinking about this again, can we just use carry bit?


Good catch, yes we can.


The code may end
up shorter than the explanation why we don't support that case :P

immed[c, 0]
alu[--, a, -, b]
alu[c, c, +carry, 0]


eBPF input will be "a = a / b", given "immed" doesn't affect carry bit,
I'd reorder the sequence so we only need one tmp register for holding
"b" who is constant.

  alu[--, a, -, b]
  immed[b, 0]
  alu[a, b, +carry, 0]
 
Thanks.

Regards,
Jiong



Should be equivalent to:

c = a >= b

(Thanks to Edwin for double-checking the carry semantics.)




Re: [PATCH v2 net-next 1/3] rds: Changing IP address internal representation to struct in6_addr

2018-07-05 Thread Santosh Shilimkar

On 6/27/2018 3:23 AM, Ka-Cheong Poon wrote:

This patch changes the internal representation of an IP address to use
struct in6_addr.  IPv4 address is stored as an IPv4 mapped address.
All the functions which take an IP address as argument are also
changed to use struct in6_addr.  But RDS socket layer is not modified
such that it still does not accept IPv6 address from an application.
And RDS layer does not accept nor initiate IPv6 connections.

v2: Fixed sparse warnings.

Signed-off-by: Ka-Cheong Poon 
---
  net/rds/af_rds.c | 138 --
  net/rds/bind.c   |  91 +-
  net/rds/cong.c   |  23 ++--
  net/rds/connection.c | 132 +
  net/rds/ib.c |  17 +--
  net/rds/ib.h |  45 +--
  net/rds/ib_cm.c  | 300 +++
  net/rds/ib_rdma.c|  15 +--
  net/rds/ib_recv.c|  18 +--
  net/rds/ib_send.c|  10 +-
  net/rds/loop.c   |   7 +-
  net/rds/rdma.c   |   6 +-
  net/rds/rdma_transport.c |  56 ++---
  net/rds/rds.h|  69 +++
  net/rds/recv.c   |  51 +---
  net/rds/send.c   |  67 ---
  net/rds/tcp.c|  32 -
  net/rds/tcp_connect.c|  34 +++---
  net/rds/tcp_listen.c |  18 +--
  net/rds/tcp_recv.c   |   9 +-
  net/rds/tcp_send.c   |   4 +-
  net/rds/threads.c|  69 +--
  net/rds/transport.c  |  15 ++-
  23 files changed, 857 insertions(+), 369 deletions(-)




diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64..3822886 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
   *
   * This software is available to you under a choice of one of two
   * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -42,42 +43,58 @@
  
  static const struct rhashtable_params ht_parms = {

.nelem_hint = 768,
-   .key_len = sizeof(u64),
+   .key_len = RDS_BOUND_KEY_LEN,

Do we really need the scope id to be part of the key? With link
local/global, do you see any collisions? Please educate me
on the actual use case. This could avoid a bunch of changes, hence
the question.


@@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, 
__be16 *port)
  rs, , (int)ntohs(*port));
break;
} else {
-   rs->rs_bound_addr = 0;
+   rs->rs_bound_addr = in6addr_any;

Can you elaborate why 0 is not ok ?


rds_sock_put(rs);
ret = -ENOMEM;
break;
@@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 
addr, __be16 *port)
  void rds_remove_bound(struct rds_sock *rs)
  {
  
-	if (!rs->rs_bound_addr)

+   if (ipv6_addr_any(>rs_bound_addr))
return;
  
-	rdsdebug("rs %p unbinding from %pI4:%d\n",

+   rdsdebug("rs %p unbinding from %pI6c:%d\n",
 rs, >rs_bound_addr,
 ntohs(rs->rs_bound_port));
  
  	rhashtable_remove_fast(_hash_table, >rs_bound_node, ht_parms);

rds_sock_put(rs);
-   rs->rs_bound_addr = 0;
+   rs->rs_bound_addr = in6addr_any;
  }
  
  int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)

  {
struct sock *sk = sock->sk;
-   struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
+   struct in6_addr v6addr, *binding_addr;
struct rds_transport *trans;
+   __u32 scope_id = 0;
int ret = 0;
+   __be16 port;
  
+	/* We only allow an RDS socket to be bound to and IPv4 address. IPv6

s/'bound to and IPv4'/'bound to an IPv4'

+* address support will be added later.
+*/
+   if (addr_len == sizeof(struct sockaddr_in)) {
+   struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+   if (sin->sin_family != AF_INET ||
+   sin->sin_addr.s_addr == htonl(INADDR_ANY))
+   return -EINVAL;
+   ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, );
+   binding_addr = 
+   port = sin->sin_port;
+   } else if (addr_len == sizeof(struct sockaddr_in6)) {
+   return -EPROTONOSUPPORT;
+   } else {
+   return -EINVAL;
+   }
lock_sock(sk);
  
-	if (addr_len != sizeof(struct sockaddr_in) ||

-   sin->sin_family != AF_INET ||
-   rs->rs_bound_addr ||
-   sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+   /* RDS socket does not allow re-binding. */
+   if (!ipv6_addr_any(>rs_bound_addr)) {
ret = -EINVAL;

Re: [PATCH v2 net-next 0/3] rds: IPv6 support

2018-07-05 Thread Sowmini Varadhan


Some additional comments on this patchset (consolidated here, 
please tease this apart into patch1/patch2/patch3 as appropriate)

I looked at the most of rds-core, and the rds-tcp changes.
Please make sure santosh looks at these carefully, especially.
- RDS bind key  changes
- connection.c
- all the rds_rdma related changes (e.g., the ib* and rdma* files)

- rds_getname(): one of the existing properties of PF_RDS is that a
  connect() does not involve an implicit bind(). Note that you are basing
  the changes in rds_bind() on this behavior, thus I guess we make the
  choice of treating this as a feature, not a bug.

  Since we are choosing to treat this behavior as a feature we
  need to be consistent in rds_getname(). If we find
  (!peer and !ipv6_addr_any(rs_conn_addr)) and the socket is not yet bound, 
  the returned address (address size, address family) should be based
  on the rs_conn_addr. (Otherwise if I connect to abc::1 without doing a 
  bind(), and try to do a getsockname(), I get back a sockaddr_in?!)

- rds_cancel_sent_to() and rds_connect and rds_bind and rds_sendmsg
  As DaveM has already pointed out, we should be using sa_family to figure
  out sockaddr_in vs sockaddr_in6 (not the other way around of inspecting
  len first, and then the family - that way won't work if I pass in
  sockaddr_storage).  E.g., see inet_dgram_connect.

if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;

- In net/rds/rds.h;

  /* The following ports, 16385, 18634, 18635, are registered with IANA as
   * the ports to be used for RDS over TCP and UDP.  18634 is the historical

  What is "RDS over TCP and UDP"? There is no such thing as RDS-over-UDP. 
  In fact RDS has nothing to do with UDP. The comment is confused. See next
  item below, where the comment disappears.
  
- Also in net/rds/rds.h
  Please don't define transport-specific parameters like RDS_CM_PORT in the
  common rds.h header. It is unfortunate that we already have RDS_PORT there,
  and we should try to clean that up as well. Note that RDS_TCP_PORT
  is now in the correct transport-module-specific header (net/rds/tcp.h)
  and it's unclean to drag it from there into the common header as you are
  doing.

  In fact I just tried to move the RDS_PORT definition into 
  net/rds/rdma_transport.h and it built just-fine. So to summarize,
  please do the following:
  1. move RDS_PORT into rdma_transport.h
  2. add RDS_CM_PORT into rdma_transport.h
  3. stop dragging RDS_TCP_PORT from its current happy home in net/rds/tcp.h
 to net/rds/rds.h

- net/rds/connection.c
  As we have discussed offline before, the fact that we cannot report
  TCP seq# etc info via the existing rds-info API is not "a bug in the
  design of MPRDS" but rather a shortcoming of the API design. Moreover,
  much of the useful information around the TCP socket is already
  available via procfs, TCP_INFO etc, so the info by rds-info is rarely
  used for rds-tcp- the more useful information is around the RDS socket
  itself.  So there is a bug in the comment, would be nice if you removed it.

  Also, while you are there, s/exisiting/existing, please.

General comments:
-
I remain unconvinced by your global <-> link-local arguments.

For UDP sockets we can do this:

 eth0
   host1 -- host2
 abc::1/64   fe80::2
 add abc::/64 as onlink subnet route


  host1# traceroute6 -i eth0 -s abc::1 fe80::2

You just broke this for RDS and are using polemic to defend your case,
but the main thrust of your diatribe seems to be "why would you need 
this?" I'll try to address that briefly here. 

- There may be lot of valid reasons why host2 does not want to be 
  configured with a global prefix. e.g., I only want host2 to be able 
  to talk to onlink hosts.

- RDS mandatorily requires sockets to be bound. So the normal src addr
  selection (that would have caused host1 to use a link-local to talk
  to host2) is suppressed in this case 

  This is exactly the same as a UDP socket bound to abc::1

  Note well, that one of the use-cases for RDS-TCP is to replace 
  existing infra that uses UDP for cluster-IPC. This has come up before
  on netdev:

  See 
https://www.mail-archive.com/search?l=netdev@vger.kernel.org=subject:%22Re%5C%3A+%5C%5BPATCH+net%5C-next+0%5C%2F6%5C%5D+kcm%5C%3A+Kernel+Connection+Multiplexor+%5C%28KCM%5C%29%22=newest=1

  so feature parity with udp is just as important as feature-parity
  for rds_rdma. 

  I hope that helps you see why we need to not break this gratuitously
  for rds-tcp.

BTW, etiquette is to cc folks who have offered review comments on the
code. Please make sure to cc me in follow-ups to this thread.

Thank you.

--Sowmini


Re: [PATCH v1 net-next 6/9] lan743x: Add power management support

2018-07-05 Thread Andrew Lunn
> +static void lan743x_ethtool_get_wol(struct net_device *netdev,
> + struct ethtool_wolinfo *wol)
> +{
> + struct lan743x_adapter *adapter = netdev_priv(netdev);
> + u32 data;
> +
> + data = lan743x_csr_read(adapter, PMT_CTL);

Hi Bryan

Why do you do this read? You do not use the result.

> +
> + wol->supported = WAKE_BCAST | WAKE_UCAST | WAKE_MCAST |
> + WAKE_MAGIC | WAKE_PHY | WAKE_ARP;
> +
> + wol->wolopts = adapter->wolopts;
> +}
> +#endif /* CONFIG_PM */
> +
> +static int lan743x_pm_wakeframe_crc16(const u8 *buf, int len)
> +{
> + const u16 crc16poly = 0x8005;
> + u16 bit, crc, msb;
> + u8 data;
> + int i;
> +
> + crc = 0x;
> + for (i = 0; i < len; i++) {
> + data = *buf++;
> + for (bit = 0; bit < 8; bit++) {
> + msb = crc >> 15;
> + crc <<= 1;
> +
> + if (msb ^ (u16)(data & 1)) {
> + crc ^= crc16poly;
> + crc |= (u16)0x0001U;
> + }
> + data >>= 1;
> + }
> + }
> +

There are a few different crc algorithms in lib. Can you use one of
them, rather than implementing it yourself?

> +#if CONFIG_PM
> +static int lan743x_pm_suspend(struct device *dev)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct net_device *netdev = pci_get_drvdata(pdev);
> + struct lan743x_adapter *adapter = netdev_priv(netdev);
> + u16 phydata;
> + int ret;
> +
> + if (adapter->wolopts & WAKE_PHY) {
> + phydata = phy_read(netdev->phydev, 27);
> + phydata |= 0x0500;
> + phy_write(netdev->phydev, 27, phydata);
> + }

Shouldn't the PHY driver do this?

  Andrew


Re: [PATCH v1 net-next 5/9] lan743x: Add support for ethtool eeprom access

2018-07-05 Thread Andrew Lunn
Hi Bryan

> +static int lan743x_ethtool_set_eeprom(struct net_device *netdev,
> +   struct ethtool_eeprom *ee, u8 *data)
> +{
> + struct lan743x_adapter *adapter = netdev_priv(netdev);
> + int ret = -EINVAL;
> +
> + if (ee->magic == LAN743X_EEPROM_MAGIC)
> + ret = lan743x_eeprom_write(adapter, ee->offset, ee->len,
> +data);
> + /* Beware!  OTP is One Time Programming ONLY!
> +  * So do some strict condition check before messing up
> +  */
> + else if ((ee->magic == LAN743X_OTP_MAGIC) &&
> +  (ee->offset == 0) &&
> +  (ee->len == 512) &&

MAX_EEPROM_SIZE ?

> +  (data[0] == OTP_INDICATOR_1))
> + ret = lan743x_otp_write(adapter, ee->offset, ee->len, data);
> +
> + return ret;
> +}

  Andrew


Re: [PATCH v1 net-next 3/9] lan743x: Add support for ethtool statistics

2018-07-05 Thread Andrew Lunn
Hi Bryan

> +static void lan743x_ethtool_get_ethtool_stats(struct net_device *netdev,
> +   struct ethtool_stats *stats,
> +   u64 *data)
> +{
> + struct lan743x_adapter *adapter = netdev_priv(netdev);
> + int data_index = 0;
> + u32 buf;
> + int i;
> +
> + for (i = 0; i < (sizeof(lan743x_set0_hw_cnt_addr) / (sizeof(u32)));

ARRAY_SIZE(lan743x_set0_hw_cnt_addr) ?

> +  i++) {
> + buf = lan743x_csr_read(adapter, lan743x_set0_hw_cnt_addr[i]);
> + data[data_index++] = (u64)buf;
> + }
> + for (i = 0; i < 4; i++)

ARRAY_SIZE(lan743x_set1_sw_cnt_strings) ??

> + data[data_index++] = (u64)(adapter->rx[i].frame_count);
> + for (i = 0; i < (sizeof(lan743x_set2_hw_cnt_addr) / (sizeof(u32)));

ARRAY_SIZE()

> +  i++) {
> + buf = lan743x_csr_read(adapter, lan743x_set2_hw_cnt_addr[i]);
> + data[data_index++] = (u64)buf;
> + }
> +}

  Andrew


Re: [RFC bpf-next 2/6] net: xdp: RX meta data infrastructure

2018-07-05 Thread Jakub Kicinski
On Wed, 4 Jul 2018 09:51:54 +0200, Daniel Borkmann wrote:
> On 07/04/2018 02:57 AM, Saeed Mahameed wrote:
> > On Tue, 2018-07-03 at 16:01 -0700, Alexei Starovoitov wrote:  
> >> How about we make driver+firmware provide a BTF definition of
> >> metadata that they
> >> can provide? There can be multiple definitions of such structs.
> >> Then in userspace we can have BTF->plain C converter.
> >> (bpftool practically ready to do that already).
> >> Then the programmer can take such generated C definition, add it to
> >> .h and include
> >> it in their programs. llvm will compile the whole thing and will
> >> include BTF
> >> of maps, progs and this md struct in the target elf file.
> >> During loading the kernel can check that BTF in elf is matching one-
> >> to-one
> >> to what driver+firmware are saying they support.  
> 
> I do like the above idea of utilizing BTF for this, seems like a good fit.
>
> > Just thinking out loud, can't we do this at program load ? just run a
> > setup function in the xdp program to load nic md BTF definition into
> > the elf section ?
> >   
> >> No ambiguity and no possibility of mistake, since offsets and field
> >> names
> >> are verified.  
> > 
> > But what about the dynamic nature of this feature ? Sometimes you only
> > want HW/Driver to provide a subset of whatever the HW can provide and
> > save md buffer for other stuff.
> > 
> > Yes a well defined format is favorable here, but we need to make sure
> > there is no computational overhead in data path just to extract each
> > field! for example if i want to know what is the offset of the hash
> > will i need to go parse (for every packet) the whole BTF definition of
> > metadata just to find the offset of type=hash ?  
> 
> I don't think this would be the case that you'd need to walk BTF in fast
> path here. In the ideal case, the only thing that driver would need to do
> in fast path would be to set proper xdp->data_meta offset and _that_ would
> be it. For the rest, program would know how to access the data since it's
> already aware of it from BTF definition the driver provided. Other drivers
> which would be less flexible on that regard would internally prep the buffer
> based on the progs needs more or less similar as in 
> mlx5e_xdp_fill_data_meta(),
> but it would be really up to the driver how to handle this internally. The
> BTF it would check at XDP setup time to do the configuration needed in the
> driver. Verifier would only check BTF, pass it along for XDP setup, prog
> rewrites in verifier aren't even needed since LLVM compiled everything
> already.

I don't think we should force drivers to place such meta data in the
buffer.  The moment that happens we lose the "zero-touch" abilities
Jesper was trying to achieve.

Besides what happens to the meta data after XDP is finished.  We really
need the ability to communicate the modified fields further to the
stack.  With meta data in the buffer we don't really know if the
information placed there after XDP finishes is still valid or did the
program overwrite it with something completely different.

I'm also not 100% on board with the argument that "future" FW can
reshuffle things whatever way it wants to.  Is the assumption that
future ASICs/FW will be designed to always use the "blessed" BTF
format?  Or will it be reconfigurable at runtime?

> >> Every driver can have their own BTF for md and their own special
> >> features.
> >> We can try to standardize the names (like vlan and csum), so xdp
> >> programs
> >> can stay relatively portable across NICs.  
> > 
> > Yes this is a must.  
> 
> Agree, there needs to be a basic common set that would be provided by
> every XDP aware driver.

I'm sorry to bring this up again, but can we really not let drivers
define their own "get_XYZ/set_XYZ" helpers, and link those to the
program at attachment time?  Sure we'd have to create a new copy of the
program for each driver it's used with, but is that really a problem?
That'd have the lowest impact on the performance and complexity of the
driver fast path.  The BTF solution already has all the same problems
WRT tail calls and not being sure which driver the program is attached
to.

> >> Such api will address exposing asic+firmware metadata to the xdp
> >> program.
> >> Once we tackle this problem, we'll think how to do the backward
> >> config
> >> (to do firmware reconfig for specific BTF definition of md supplied
> >> by the prog).
> >> What people think?  
> > 
> > For legacy HW, we can do it already in the driver, provide whatever the
> > prog requested, its only a matter of translation to the BTF format in
> > the driver xdp setup and pushing the values accordingly into the md
> > offsets on data path.
> > 
> > Question: how can you share the md BTF from the driver/HW with the xdp
> > program ?
> 
> I think this would likely be a new query as in XDP_QUERY_META_BTF
> implemented in ndo_bpf callback and then exported e.g. via bpf(2)
> or netlink such that bpftool can 

Re: [PATCH v1 net-next 1/9] lan743x: Add support for ethtool get_drvinfo

2018-07-05 Thread Andrew Lunn
On Thu, Jul 05, 2018 at 12:39:18PM -0400, Bryan Whitehead wrote:

Hi Bryan

It is normal to put something in the commit message, even if it is the
Subject line said in a different way.

Otherwise, this looks O.K.

Andrew


[PATCH] net/sunrpc: Make rpc_auth_create_args a const

2018-07-05 Thread Sargun Dhillon
This turns rpc_auth_create_args into a const as it gets passed through the
auth stack.

Signed-off-by: Sargun Dhillon 
---
 include/linux/sunrpc/auth.h| 5 +++--
 net/sunrpc/auth.c  | 2 +-
 net/sunrpc/auth_gss/auth_gss.c | 9 +
 net/sunrpc/auth_null.c | 2 +-
 net/sunrpc/auth_unix.c | 2 +-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index d9af474a857d..58a6765c1c5e 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -125,7 +125,8 @@ struct rpc_authops {
struct module   *owner;
rpc_authflavor_tau_flavor;  /* flavor (RPC_AUTH_*) */
char *  au_name;
-   struct rpc_auth *   (*create)(struct rpc_auth_create_args *, struct 
rpc_clnt *);
+   struct rpc_auth *   (*create)(const struct rpc_auth_create_args *,
+ struct rpc_clnt *);
void(*destroy)(struct rpc_auth *);
 
int (*hash_cred)(struct auth_cred *, unsigned int);
@@ -174,7 +175,7 @@ struct rpc_cred *   rpc_lookup_generic_cred(struct 
auth_cred *, int, gfp_t);
 struct rpc_cred *  rpc_lookup_machine_cred(const char *service_name);
 intrpcauth_register(const struct rpc_authops *);
 intrpcauth_unregister(const struct rpc_authops *);
-struct rpc_auth *  rpcauth_create(struct rpc_auth_create_args *,
+struct rpc_auth *  rpcauth_create(const struct rpc_auth_create_args *,
struct rpc_clnt *);
 void   rpcauth_release(struct rpc_auth *);
 rpc_authflavor_t   rpcauth_get_pseudoflavor(rpc_authflavor_t,
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d2623b9f23d6..661e2277f468 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -253,7 +253,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
 EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
 
 struct rpc_auth *
-rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
struct rpc_auth *auth;
const struct rpc_authops *ops;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index be8f103d22fd..21a19a9f0e33 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -985,7 +985,7 @@ static void gss_pipe_free(struct gss_pipe *p)
  * parameters based on the input flavor (which must be a pseudoflavor)
  */
 static struct gss_auth *
-gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
rpc_authflavor_t flavor = args->pseudoflavor;
struct gss_auth *gss_auth;
@@ -1132,7 +1132,7 @@ gss_destroy(struct rpc_auth *auth)
  * (which is guaranteed to last as long as any of its descendants).
  */
 static struct gss_auth *
-gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
+gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
struct rpc_clnt *clnt,
struct gss_auth *new)
 {
@@ -1169,7 +1169,8 @@ gss_auth_find_or_add_hashed(struct rpc_auth_create_args 
*args,
 }
 
 static struct gss_auth *
-gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_hashed(const struct rpc_auth_create_args *args,
+ struct rpc_clnt *clnt)
 {
struct gss_auth *gss_auth;
struct gss_auth *new;
@@ -1188,7 +1189,7 @@ gss_create_hashed(struct rpc_auth_create_args *args, 
struct rpc_clnt *clnt)
 }
 
 static struct rpc_auth *
-gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
struct gss_auth *gss_auth;
struct rpc_xprt_switch *xps = 
rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 75d72e109a04..4b48228ee8c7 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -19,7 +19,7 @@ static struct rpc_auth null_auth;
 static struct rpc_cred null_cred;
 
 static struct rpc_auth *
-nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
    atomic_inc(&null_auth.au_count);
    return &null_auth;
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index dafd6b870ba3..185e56d4f9ae 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -30,7 +30,7 @@ static struct rpc_authunix_auth;
 static const struct rpc_credopsunix_credops;
 
 static struct rpc_auth *
-unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
dprintk("RPC:   

[PATCH v1 net-next 1/9] lan743x: Add support for ethtool get_drvinfo

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/Makefile  |  2 +-
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 21 +
 drivers/net/ethernet/microchip/lan743x_ethtool.h | 11 +++
 drivers/net/ethernet/microchip/lan743x_main.c|  2 ++
 4 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ethtool.c
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ethtool.h

diff --git a/drivers/net/ethernet/microchip/Makefile 
b/drivers/net/ethernet/microchip/Makefile
index 2e982cc..43f47cb 100644
--- a/drivers/net/ethernet/microchip/Makefile
+++ b/drivers/net/ethernet/microchip/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_ENC28J60) += enc28j60.o
 obj-$(CONFIG_ENCX24J600) += encx24j600.o encx24j600-regmap.o
 obj-$(CONFIG_LAN743X) += lan743x.o
 
-lan743x-objs := lan743x_main.o
+lan743x-objs := lan743x_main.o lan743x_ethtool.o
diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
new file mode 100644
index 000..0e20758
--- /dev/null
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/* Copyright (C) 2018 Microchip Technology Inc. */
+
+#include 
+#include "lan743x_main.h"
+#include "lan743x_ethtool.h"
+#include 
+
+static void lan743x_ethtool_get_drvinfo(struct net_device *netdev,
+   struct ethtool_drvinfo *info)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+   strlcpy(info->bus_info,
+   pci_name(adapter->pdev), sizeof(info->bus_info));
+}
+
+const struct ethtool_ops lan743x_ethtool_ops = {
+   .get_drvinfo = lan743x_ethtool_get_drvinfo,
+};
diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.h 
b/drivers/net/ethernet/microchip/lan743x_ethtool.h
new file mode 100644
index 000..d0d11a7
--- /dev/null
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/* Copyright (C) 2018 Microchip Technology Inc. */
+
+#ifndef _LAN743X_ETHTOOL_H
+#define _LAN743X_ETHTOOL_H
+
+#include "linux/ethtool.h"
+
+extern const struct ethtool_ops lan743x_ethtool_ops;
+
+#endif /* _LAN743X_ETHTOOL_H */
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index e1747a4..ade3b04 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include "lan743x_main.h"
+#include "lan743x_ethtool.h"
 
 static void lan743x_pci_cleanup(struct lan743x_adapter *adapter)
 {
@@ -2689,6 +2690,7 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev,
goto cleanup_hardware;
 
    adapter->netdev->netdev_ops = &lan743x_netdev_ops;
+   adapter->netdev->ethtool_ops = &lan743x_ethtool_ops;
adapter->netdev->features = NETIF_F_SG | NETIF_F_TSO | NETIF_F_HW_CSUM;
adapter->netdev->hw_features = adapter->netdev->features;
 
-- 
2.7.4



[PATCH v1 net-next 4/9] lan743x: Add support for ethtool message level

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 988c67c..addd628 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -17,6 +17,21 @@ static void lan743x_ethtool_get_drvinfo(struct net_device 
*netdev,
pci_name(adapter->pdev), sizeof(info->bus_info));
 }
 
+static u32 lan743x_ethtool_get_msglevel(struct net_device *netdev)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   return adapter->msg_enable;
+}
+
+static void lan743x_ethtool_set_msglevel(struct net_device *netdev,
+u32 msglevel)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   adapter->msg_enable = msglevel;
+}
+
 static const char lan743x_set0_hw_cnt_strings[][ETH_GSTRING_LEN] = {
"RX FCS Errors",
"RX Alignment Errors",
@@ -198,6 +213,8 @@ static int lan743x_ethtool_get_sset_count(struct net_device 
*netdev, int sset)
 
 const struct ethtool_ops lan743x_ethtool_ops = {
.get_drvinfo = lan743x_ethtool_get_drvinfo,
+   .get_msglevel = lan743x_ethtool_get_msglevel,
+   .set_msglevel = lan743x_ethtool_set_msglevel,
.get_link = ethtool_op_get_link,
 
.get_strings = lan743x_ethtool_get_strings,
-- 
2.7.4



[PATCH v1 net-next 0/9] lan743x: Add features to lan743x driver.

2018-07-05 Thread Bryan Whitehead
This patch series adds extra features to the lan743x driver.

Bryan Whitehead (9):
  lan743x: Add support for ethtool get_drvinfo
  lan743x: Add support for ethtool link settings
  lan743x: Add support for ethtool statistics
  lan743x: Add support for ethtool message level
  lan743x: Add support for ethtool eeprom access
  lan743x: Add power management support
  lan743x: Add EEE support
  lan743x: Add RSS support
  lan743x: Add PTP support

 drivers/net/ethernet/microchip/Makefile  |2 +-
 drivers/net/ethernet/microchip/lan743x_ethtool.c |  734 +
 drivers/net/ethernet/microchip/lan743x_ethtool.h |   11 +
 drivers/net/ethernet/microchip/lan743x_main.c|  319 +-
 drivers/net/ethernet/microchip/lan743x_main.h|  229 -
 drivers/net/ethernet/microchip/lan743x_ptp.c | 1194 ++
 drivers/net/ethernet/microchip/lan743x_ptp.h |   78 ++
 7 files changed, 2559 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ethtool.c
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ethtool.h
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ptp.c
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ptp.h

-- 
2.7.4



[PATCH v1 net-next 3/9] lan743x: Add support for ethtool statistics

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 182 +++
 drivers/net/ethernet/microchip/lan743x_main.c|   6 +-
 drivers/net/ethernet/microchip/lan743x_main.h|  31 
 3 files changed, 216 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 5c4582c..988c67c 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -17,10 +17,192 @@ static void lan743x_ethtool_get_drvinfo(struct net_device 
*netdev,
pci_name(adapter->pdev), sizeof(info->bus_info));
 }
 
+static const char lan743x_set0_hw_cnt_strings[][ETH_GSTRING_LEN] = {
+   "RX FCS Errors",
+   "RX Alignment Errors",
+   "Rx Fragment Errors",
+   "RX Jabber Errors",
+   "RX Undersize Frame Errors",
+   "RX Oversize Frame Errors",
+   "RX Dropped Frames",
+   "RX Unicast Byte Count",
+   "RX Broadcast Byte Count",
+   "RX Multicast Byte Count",
+   "RX Unicast Frames",
+   "RX Broadcast Frames",
+   "RX Multicast Frames",
+   "RX Pause Frames",
+   "RX 64 Byte Frames",
+   "RX 65 - 127 Byte Frames",
+   "RX 128 - 255 Byte Frames",
+   "RX 256 - 511 Bytes Frames",
+   "RX 512 - 1023 Byte Frames",
+   "RX 1024 - 1518 Byte Frames",
+   "RX Greater 1518 Byte Frames",
+};
+
+static const char lan743x_set1_sw_cnt_strings[][ETH_GSTRING_LEN] = {
+   "RX Queue 0 Frames",
+   "RX Queue 1 Frames",
+   "RX Queue 2 Frames",
+   "RX Queue 3 Frames",
+};
+
+static const char lan743x_set2_hw_cnt_strings[][ETH_GSTRING_LEN] = {
+   "RX Total Frames",
+   "EEE RX LPI Transitions",
+   "EEE RX LPI Time",
+   "RX Counter Rollover Status",
+   "TX FCS Errors",
+   "TX Excess Deferral Errors",
+   "TX Carrier Errors",
+   "TX Bad Byte Count",
+   "TX Single Collisions",
+   "TX Multiple Collisions",
+   "TX Excessive Collision",
+   "TX Late Collisions",
+   "TX Unicast Byte Count",
+   "TX Broadcast Byte Count",
+   "TX Multicast Byte Count",
+   "TX Unicast Frames",
+   "TX Broadcast Frames",
+   "TX Multicast Frames",
+   "TX Pause Frames",
+   "TX 64 Byte Frames",
+   "TX 65 - 127 Byte Frames",
+   "TX 128 - 255 Byte Frames",
+   "TX 256 - 511 Bytes Frames",
+   "TX 512 - 1023 Byte Frames",
+   "TX 1024 - 1518 Byte Frames",
+   "TX Greater 1518 Byte Frames",
+   "TX Total Frames",
+   "EEE TX LPI Transitions",
+   "EEE TX LPI Time",
+   "TX Counter Rollover Status",
+};
+
+static const u32 lan743x_set0_hw_cnt_addr[] = {
+   STAT_RX_FCS_ERRORS,
+   STAT_RX_ALIGNMENT_ERRORS,
+   STAT_RX_FRAGMENT_ERRORS,
+   STAT_RX_JABBER_ERRORS,
+   STAT_RX_UNDERSIZE_FRAME_ERRORS,
+   STAT_RX_OVERSIZE_FRAME_ERRORS,
+   STAT_RX_DROPPED_FRAMES,
+   STAT_RX_UNICAST_BYTE_COUNT,
+   STAT_RX_BROADCAST_BYTE_COUNT,
+   STAT_RX_MULTICAST_BYTE_COUNT,
+   STAT_RX_UNICAST_FRAMES,
+   STAT_RX_BROADCAST_FRAMES,
+   STAT_RX_MULTICAST_FRAMES,
+   STAT_RX_PAUSE_FRAMES,
+   STAT_RX_64_BYTE_FRAMES,
+   STAT_RX_65_127_BYTE_FRAMES,
+   STAT_RX_128_255_BYTE_FRAMES,
+   STAT_RX_256_511_BYTES_FRAMES,
+   STAT_RX_512_1023_BYTE_FRAMES,
+   STAT_RX_1024_1518_BYTE_FRAMES,
+   STAT_RX_GREATER_1518_BYTE_FRAMES,
+};
+
+static const u32 lan743x_set2_hw_cnt_addr[] = {
+   STAT_RX_TOTAL_FRAMES,
+   STAT_EEE_RX_LPI_TRANSITIONS,
+   STAT_EEE_RX_LPI_TIME,
+   STAT_RX_COUNTER_ROLLOVER_STATUS,
+   STAT_TX_FCS_ERRORS,
+   STAT_TX_EXCESS_DEFERRAL_ERRORS,
+   STAT_TX_CARRIER_ERRORS,
+   STAT_TX_BAD_BYTE_COUNT,
+   STAT_TX_SINGLE_COLLISIONS,
+   STAT_TX_MULTIPLE_COLLISIONS,
+   STAT_TX_EXCESSIVE_COLLISION,
+   STAT_TX_LATE_COLLISIONS,
+   STAT_TX_UNICAST_BYTE_COUNT,
+   STAT_TX_BROADCAST_BYTE_COUNT,
+   STAT_TX_MULTICAST_BYTE_COUNT,
+   STAT_TX_UNICAST_FRAMES,
+   STAT_TX_BROADCAST_FRAMES,
+   STAT_TX_MULTICAST_FRAMES,
+   STAT_TX_PAUSE_FRAMES,
+   STAT_TX_64_BYTE_FRAMES,
+   STAT_TX_65_127_BYTE_FRAMES,
+   STAT_TX_128_255_BYTE_FRAMES,
+   STAT_TX_256_511_BYTES_FRAMES,
+   STAT_TX_512_1023_BYTE_FRAMES,
+   STAT_TX_1024_1518_BYTE_FRAMES,
+   STAT_TX_GREATER_1518_BYTE_FRAMES,
+   STAT_TX_TOTAL_FRAMES,
+   STAT_EEE_TX_LPI_TRANSITIONS,
+   STAT_EEE_TX_LPI_TIME,
+   STAT_TX_COUNTER_ROLLOVER_STATUS
+};
+
+static void lan743x_ethtool_get_strings(struct net_device *netdev,
+   u32 stringset, u8 *data)
+{
+   switch (stringset) {
+   case ETH_SS_STATS:
+   memcpy(data, lan743x_set0_hw_cnt_strings,
+  sizeof(lan743x_set0_hw_cnt_strings));
+   memcpy(&data[sizeof(lan743x_set0_hw_cnt_strings)],
+

[PATCH v1 net-next 2/9] lan743x: Add support for ethtool link settings

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 0e20758..5c4582c 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -5,6 +5,7 @@
 #include "lan743x_main.h"
 #include "lan743x_ethtool.h"
 #include 
+#include 
 
 static void lan743x_ethtool_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
@@ -18,4 +19,8 @@ static void lan743x_ethtool_get_drvinfo(struct net_device 
*netdev,
 
 const struct ethtool_ops lan743x_ethtool_ops = {
.get_drvinfo = lan743x_ethtool_get_drvinfo,
+   .get_link = ethtool_op_get_link,
+
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
+   .set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
-- 
2.7.4



[PATCH v1 net-next 7/9] lan743x: Add EEE support

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 89 
 drivers/net/ethernet/microchip/lan743x_main.h|  3 +
 2 files changed, 92 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 0709a8d..edbe8b1 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -417,6 +417,93 @@ static int lan743x_ethtool_get_sset_count(struct 
net_device *netdev, int sset)
}
 }
 
+static int lan743x_ethtool_get_eee(struct net_device *netdev,
+  struct ethtool_eee *eee)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+   struct phy_device *phydev = netdev->phydev;
+   u32 buf;
+   int ret;
+
+   if (!phydev)
+   return -EIO;
+   if (!phydev->drv) {
+   netif_err(adapter, drv, adapter->netdev,
+ "Missing PHY Driver\n");
+   return -EIO;
+   }
+
+   ret = phy_ethtool_get_eee(phydev, eee);
+   if (ret < 0)
+   return ret;
+
+   buf = lan743x_csr_read(adapter, MAC_CR);
+   if (buf & MAC_CR_EEE_EN_) {
+   eee->eee_enabled = true;
+   eee->eee_active = !!(eee->advertised & eee->lp_advertised);
+   eee->tx_lpi_enabled = true;
+   /* EEE_TX_LPI_REQ_DLY & tx_lpi_timer are same uSec unit */
+   buf = lan743x_csr_read(adapter, MAC_EEE_TX_LPI_REQ_DLY_CNT);
+   eee->tx_lpi_timer = buf;
+   } else {
+   eee->eee_enabled = false;
+   eee->eee_active = false;
+   eee->tx_lpi_enabled = false;
+   eee->tx_lpi_timer = 0;
+   }
+
+   return 0;
+}
+
+static int lan743x_ethtool_set_eee(struct net_device *netdev,
+  struct ethtool_eee *eee)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+   struct phy_device *phydev = NULL;
+   u32 buf = 0;
+   int ret = 0;
+
+   if (!netdev)
+   return -EINVAL;
+   adapter = netdev_priv(netdev);
+   if (!adapter)
+   return -EINVAL;
+   phydev = netdev->phydev;
+   if (!phydev)
+   return -EIO;
+   if (!phydev->drv) {
+   netif_err(adapter, drv, adapter->netdev,
+ "Missing PHY Driver\n");
+   return -EIO;
+   }
+
+   if (eee->eee_enabled) {
+   ret = phy_init_eee(phydev, 0);
+   if (ret) {
+   netif_err(adapter, drv, adapter->netdev,
+ "EEE initialization failed\n");
+   return ret;
+   }
+
+   buf = lan743x_csr_read(adapter, MAC_CR);
+   buf |= MAC_CR_EEE_EN_;
+   lan743x_csr_write(adapter, MAC_CR, buf);
+
+   phy_ethtool_set_eee(phydev, eee);
+
+   buf = (u32)eee->tx_lpi_timer;
+   lan743x_csr_write(adapter, MAC_EEE_TX_LPI_REQ_DLY_CNT, buf);
+   netif_info(adapter, drv, adapter->netdev, "Enabled EEE\n");
+   } else {
+   buf = lan743x_csr_read(adapter, MAC_CR);
+   buf &= ~MAC_CR_EEE_EN_;
+   lan743x_csr_write(adapter, MAC_CR, buf);
+   netif_info(adapter, drv, adapter->netdev, "Disabled EEE\n");
+   }
+
+   return 0;
+}
+
 #ifdef CONFIG_PM
 static void lan743x_ethtool_get_wol(struct net_device *netdev,
struct ethtool_wolinfo *wol)
@@ -476,6 +563,8 @@ const struct ethtool_ops lan743x_ethtool_ops = {
.get_strings = lan743x_ethtool_get_strings,
.get_ethtool_stats = lan743x_ethtool_get_ethtool_stats,
.get_sset_count = lan743x_ethtool_get_sset_count,
+   .get_eee = lan743x_ethtool_get_eee,
+   .set_eee = lan743x_ethtool_set_eee,
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
 #ifdef CONFIG_PM
diff --git a/drivers/net/ethernet/microchip/lan743x_main.h 
b/drivers/net/ethernet/microchip/lan743x_main.h
index 72b9beb..93cb60a 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.h
+++ b/drivers/net/ethernet/microchip/lan743x_main.h
@@ -82,6 +82,7 @@
((value << 0) & FCT_FLOW_CTL_ON_THRESHOLD_)
 
 #define MAC_CR (0x100)
+#define MAC_CR_EEE_EN_ BIT(17)
 #define MAC_CR_ADD_BIT(12)
 #define MAC_CR_ASD_BIT(11)
 #define MAC_CR_CNTR_RST_   BIT(5)
@@ -117,6 +118,8 @@
 
 #define MAC_MII_DATA   (0x124)
 
+#define MAC_EEE_TX_LPI_REQ_DLY_CNT (0x130)
+
 #define MAC_WUCSR  (0x140)
 #define MAC_WUCSR_RFE_WAKE_EN_ BIT(14)
 #define MAC_WUCSR_PFDA_EN_ BIT(3)
-- 
2.7.4



[PATCH v1 net-next 5/9] lan743x: Add support for ethtool eeprom access

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 209 +++
 drivers/net/ethernet/microchip/lan743x_main.h|  33 
 2 files changed, 242 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index addd628..0d0c997 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -7,6 +7,178 @@
 #include 
 #include 
 
+/* eeprom */
+#define LAN743X_EEPROM_MAGIC   (0x74A5)
+#define LAN743X_OTP_MAGIC  (0x74F3)
+#define EEPROM_INDICATOR_1 (0xA5)
+#define EEPROM_INDICATOR_2 (0xAA)
+#define EEPROM_MAC_OFFSET  (0x01)
+#define MAX_EEPROM_SIZE512
+#define OTP_INDICATOR_1(0xF3)
+#define OTP_INDICATOR_2(0xF7)
+
+static int lan743x_otp_write(struct lan743x_adapter *adapter, u32 offset,
+u32 length, u8 *data)
+{
+   unsigned long timeout;
+   u32 buf;
+   int i;
+
+   buf = lan743x_csr_read(adapter, OTP_PWR_DN);
+
+   if (buf & OTP_PWR_DN_PWRDN_N_) {
+   /* clear it and wait to be cleared */
+   lan743x_csr_write(adapter, OTP_PWR_DN, 0);
+
+   timeout = jiffies + HZ;
+   do {
+   udelay(1);
+   buf = lan743x_csr_read(adapter, OTP_PWR_DN);
+   if (time_after(jiffies, timeout)) {
+   netif_warn(adapter, drv, adapter->netdev,
+  "timeout on OTP_PWR_DN 
completion\n");
+   return -EIO;
+   }
+   } while (buf & OTP_PWR_DN_PWRDN_N_);
+   }
+
+   /* set to BYTE program mode */
+   lan743x_csr_write(adapter, OTP_PRGM_MODE, OTP_PRGM_MODE_BYTE_);
+
+   for (i = 0; i < length; i++) {
+   lan743x_csr_write(adapter, OTP_ADDR1,
+ ((offset + i) >> 8) &
+ OTP_ADDR1_15_11_MASK_);
+   lan743x_csr_write(adapter, OTP_ADDR2,
+ ((offset + i) &
+ OTP_ADDR2_10_3_MASK_));
+   lan743x_csr_write(adapter, OTP_PRGM_DATA, data[i]);
+   lan743x_csr_write(adapter, OTP_TST_CMD, OTP_TST_CMD_PRGVRFY_);
+   lan743x_csr_write(adapter, OTP_CMD_GO, OTP_CMD_GO_GO_);
+
+   timeout = jiffies + HZ;
+   do {
+   udelay(1);
+   buf = lan743x_csr_read(adapter, OTP_STATUS);
+   if (time_after(jiffies, timeout)) {
+   netif_warn(adapter, drv, adapter->netdev,
+  "Timeout on OTP_STATUS 
completion\n");
+   return -EIO;
+   }
+   } while (buf & OTP_STATUS_BUSY_);
+   }
+
+   return 0;
+}
+
+static int lan743x_eeprom_wait(struct lan743x_adapter *adapter)
+{
+   unsigned long start_time = jiffies;
+   u32 val;
+
+   do {
+   val = lan743x_csr_read(adapter, E2P_CMD);
+
+   if (!(val & E2P_CMD_EPC_BUSY_) ||
+   (val & E2P_CMD_EPC_TIMEOUT_))
+   break;
+   usleep_range(40, 100);
+   } while (!time_after(jiffies, start_time + HZ));
+
+   if (val & (E2P_CMD_EPC_TIMEOUT_ | E2P_CMD_EPC_BUSY_)) {
+   netif_warn(adapter, drv, adapter->netdev,
+  "EEPROM read operation timeout\n");
+   return -EIO;
+   }
+
+   return 0;
+}
+
+static int lan743x_eeprom_confirm_not_busy(struct lan743x_adapter *adapter)
+{
+   unsigned long start_time = jiffies;
+   u32 val;
+
+   do {
+   val = lan743x_csr_read(adapter, E2P_CMD);
+
+   if (!(val & E2P_CMD_EPC_BUSY_))
+   return 0;
+
+   usleep_range(40, 100);
+   } while (!time_after(jiffies, start_time + HZ));
+
+   netif_warn(adapter, drv, adapter->netdev, "EEPROM is busy\n");
+   return -EIO;
+}
+
+static int lan743x_eeprom_read(struct lan743x_adapter *adapter,
+  u32 offset, u32 length, u8 *data)
+{
+   int retval;
+   u32 val;
+   int i;
+
+   retval = lan743x_eeprom_confirm_not_busy(adapter);
+   if (retval)
+   return retval;
+
+   for (i = 0; i < length; i++) {
+   val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_READ_;
+   val |= (offset & E2P_CMD_EPC_ADDR_MASK_);
+   lan743x_csr_write(adapter, E2P_CMD, val);
+
+   retval = lan743x_eeprom_wait(adapter);
+   if (retval < 0)
+   return retval;
+
+   val = lan743x_csr_read(adapter, 

[PATCH v1 net-next 6/9] lan743x: Add power management support

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c |  51 ++
 drivers/net/ethernet/microchip/lan743x_main.c| 210 +++
 drivers/net/ethernet/microchip/lan743x_main.h|  47 +
 3 files changed, 308 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 0d0c997..0709a8d 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -417,6 +417,53 @@ static int lan743x_ethtool_get_sset_count(struct 
net_device *netdev, int sset)
}
 }
 
+#ifdef CONFIG_PM
+static void lan743x_ethtool_get_wol(struct net_device *netdev,
+   struct ethtool_wolinfo *wol)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+   u32 data;
+
+   data = lan743x_csr_read(adapter, PMT_CTL);
+
+   wol->supported = WAKE_BCAST | WAKE_UCAST | WAKE_MCAST |
+   WAKE_MAGIC | WAKE_PHY | WAKE_ARP;
+
+   wol->wolopts = adapter->wolopts;
+}
+#endif /* CONFIG_PM */
+
+#if CONFIG_PM
+static int lan743x_ethtool_set_wol(struct net_device *netdev,
+  struct ethtool_wolinfo *wol)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   if (wol->wolopts & WAKE_MAGICSECURE)
+   return -EOPNOTSUPP;
+
+   adapter->wolopts = 0;
+   if (wol->wolopts & WAKE_UCAST)
+   adapter->wolopts |= WAKE_UCAST;
+   if (wol->wolopts & WAKE_MCAST)
+   adapter->wolopts |= WAKE_MCAST;
+   if (wol->wolopts & WAKE_BCAST)
+   adapter->wolopts |= WAKE_BCAST;
+   if (wol->wolopts & WAKE_MAGIC)
+   adapter->wolopts |= WAKE_MAGIC;
+   if (wol->wolopts & WAKE_PHY)
+   adapter->wolopts |= WAKE_PHY;
+   if (wol->wolopts & WAKE_ARP)
+   adapter->wolopts |= WAKE_ARP;
+
+   device_set_wakeup_enable(&adapter->pdev->dev, (bool)wol->wolopts);
+
+   phy_ethtool_set_wol(netdev->phydev, wol);
+
+   return 0;
+}
+#endif /* CONFIG_PM */
+
 const struct ethtool_ops lan743x_ethtool_ops = {
.get_drvinfo = lan743x_ethtool_get_drvinfo,
.get_msglevel = lan743x_ethtool_get_msglevel,
@@ -431,4 +478,8 @@ const struct ethtool_ops lan743x_ethtool_ops = {
.get_sset_count = lan743x_ethtool_get_sset_count,
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
+#ifdef CONFIG_PM
+   .get_wol = lan743x_ethtool_get_wol,
+   .set_wol = lan743x_ethtool_set_wol,
+#endif
 };
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index 1e2f8c6..52ca8b9 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -2749,10 +2749,217 @@ static void lan743x_pcidev_shutdown(struct pci_dev 
*pdev)
lan743x_netdev_close(netdev);
rtnl_unlock();
 
+#ifdef CONFIG_PM
+   pci_save_state(pdev);
+#endif
+
/* clean up lan743x portion */
lan743x_hardware_cleanup(adapter);
 }
 
+#ifdef CONFIG_PM
+static int lan743x_pm_wakeframe_crc16(const u8 *buf, int len)
+{
+   const u16 crc16poly = 0x8005;
+   u16 bit, crc, msb;
+   u8 data;
+   int i;
+
+   crc = 0xFFFF;
+   for (i = 0; i < len; i++) {
+   data = *buf++;
+   for (bit = 0; bit < 8; bit++) {
+   msb = crc >> 15;
+   crc <<= 1;
+
+   if (msb ^ (u16)(data & 1)) {
+   crc ^= crc16poly;
+   crc |= (u16)0x0001U;
+   }
+   data >>= 1;
+   }
+   }
+
+   return crc;
+}
+#endif /* CONFIG_PM */
+
+#if CONFIG_PM
+static void lan743x_pm_set_wol(struct lan743x_adapter *adapter)
+{
+   const u8 ipv4_multicast[3] = { 0x01, 0x00, 0x5E };
+   const u8 ipv6_multicast[3] = { 0x33, 0x33 };
+   const u8 arp_type[2] = { 0x08, 0x06 };
+   int mask_index;
+   u32 pmtctl;
+   u32 wucsr;
+   u32 macrx;
+   u16 crc;
+
+   for (mask_index = 0; mask_index < MAC_NUM_OF_WUF_CFG; mask_index++)
+   lan743x_csr_write(adapter, MAC_WUF_CFG(mask_index), 0);
+
+   /* clear wake settings */
+   pmtctl = lan743x_csr_read(adapter, PMT_CTL);
+   pmtctl |= PMT_CTL_WUPS_MASK_;
+   pmtctl &= ~(PMT_CTL_GPIO_WAKEUP_EN_ | PMT_CTL_EEE_WAKEUP_EN_ |
+   PMT_CTL_WOL_EN_ | PMT_CTL_MAC_D3_RX_CLK_OVR_ |
+   PMT_CTL_RX_FCT_RFE_D3_CLK_OVR_ | PMT_CTL_ETH_PHY_WAKE_EN_);
+
+   macrx = lan743x_csr_read(adapter, MAC_RX);
+
+   wucsr = 0;
+   mask_index = 0;
+
+   pmtctl |= PMT_CTL_ETH_PHY_D3_COLD_OVR_ | PMT_CTL_ETH_PHY_D3_OVR_;
+
+   if (adapter->wolopts & WAKE_PHY) {
+   pmtctl |= PMT_CTL_ETH_PHY_EDPD_PLL_CTL_;
+  

[PATCH v1 net-next 8/9] lan743x: Add RSS support

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/lan743x_ethtool.c | 132 +++
 drivers/net/ethernet/microchip/lan743x_main.c|  20 
 drivers/net/ethernet/microchip/lan743x_main.h|  19 
 3 files changed, 171 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index edbe8b1..aa3421e 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -417,6 +417,133 @@ static int lan743x_ethtool_get_sset_count(struct 
net_device *netdev, int sset)
}
 }
 
+static int lan743x_ethtool_get_rxnfc(struct net_device *netdev,
+struct ethtool_rxnfc *rxnfc,
+u32 *rule_locs)
+{
+   switch (rxnfc->cmd) {
+   case ETHTOOL_GRXFH:
+   rxnfc->data = 0;
+   switch (rxnfc->flow_type) {
+   case TCP_V4_FLOW:case UDP_V4_FLOW:
+   case TCP_V6_FLOW:case UDP_V6_FLOW:
+   rxnfc->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   /* fall through */
+   case IPV4_FLOW: case IPV6_FLOW:
+   rxnfc->data |= RXH_IP_SRC | RXH_IP_DST;
+   return 0;
+   }
+   break;
+   case ETHTOOL_GRXRINGS:
+   rxnfc->data = LAN743X_USED_RX_CHANNELS;
+   return 0;
+   }
+   return -EOPNOTSUPP;
+}
+
+static u32 lan743x_ethtool_get_rxfh_key_size(struct net_device *netdev)
+{
+   return 40;
+}
+
+static u32 lan743x_ethtool_get_rxfh_indir_size(struct net_device *netdev)
+{
+   return 128;
+}
+
+static int lan743x_ethtool_get_rxfh(struct net_device *netdev,
+   u32 *indir, u8 *key, u8 *hfunc)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   if (indir) {
+   int dw_index;
+   int byte_index = 0;
+
+   for (dw_index = 0; dw_index < 32; dw_index++) {
+   u32 four_entries =
+   lan743x_csr_read(adapter, RFE_INDX(dw_index));
+
+   byte_index = dw_index << 2;
+   indir[byte_index + 0] =
+   ((four_entries >> 0) & 0x00FF);
+   indir[byte_index + 1] =
+   ((four_entries >> 8) & 0x00FF);
+   indir[byte_index + 2] =
+   ((four_entries >> 16) & 0x00FF);
+   indir[byte_index + 3] =
+   ((four_entries >> 24) & 0x00FF);
+   }
+   }
+   if (key) {
+   int dword_index;
+   int byte_index = 0;
+
+   for (dword_index = 0; dword_index < 10; dword_index++) {
+   u32 four_entries =
+   lan743x_csr_read(adapter,
+RFE_HASH_KEY(dword_index));
+
+   byte_index = dword_index << 2;
+   key[byte_index + 0] =
+   ((four_entries >> 0) & 0x00FF);
+   key[byte_index + 1] =
+   ((four_entries >> 8) & 0x00FF);
+   key[byte_index + 2] =
+   ((four_entries >> 16) & 0x00FF);
+   key[byte_index + 3] =
+   ((four_entries >> 24) & 0x00FF);
+   }
+   }
+   if (hfunc)
+   (*hfunc) = ETH_RSS_HASH_TOP;
+   return 0;
+}
+
+static int lan743x_ethtool_set_rxfh(struct net_device *netdev,
+   const u32 *indir, const u8 *key,
+   const u8 hfunc)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+   return -EOPNOTSUPP;
+
+   if (indir) {
+   u32 indir_value = 0;
+   int dword_index = 0;
+   int byte_index = 0;
+
+   for (dword_index = 0; dword_index < 32; dword_index++) {
+   byte_index = dword_index << 2;
+   indir_value =
+   (((indir[byte_index + 0] & 0x00FF) << 0) |
+   ((indir[byte_index + 1] & 0x00FF) << 8) |
+   ((indir[byte_index + 2] & 0x00FF) << 16) |
+   ((indir[byte_index + 3] & 0x00FF) << 24));
+   lan743x_csr_write(adapter, RFE_INDX(dword_index),
+ indir_value);
+   }
+   }
+   if (key) {
+   int dword_index = 0;
+   int byte_index = 0;
+   u32 key_value = 0;
+
+   

[PATCH v1 net-next 9/9] lan743x: Add PTP support

2018-07-05 Thread Bryan Whitehead
Signed-off-by: Bryan Whitehead 
---
 drivers/net/ethernet/microchip/Makefile  |2 +-
 drivers/net/ethernet/microchip/lan743x_ethtool.c |   28 +
 drivers/net/ethernet/microchip/lan743x_main.c|   81 +-
 drivers/net/ethernet/microchip/lan743x_main.h|   96 +-
 drivers/net/ethernet/microchip/lan743x_ptp.c | 1194 ++
 drivers/net/ethernet/microchip/lan743x_ptp.h |   78 ++
 6 files changed, 1474 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ptp.c
 create mode 100644 drivers/net/ethernet/microchip/lan743x_ptp.h

diff --git a/drivers/net/ethernet/microchip/Makefile 
b/drivers/net/ethernet/microchip/Makefile
index 43f47cb..538926d 100644
--- a/drivers/net/ethernet/microchip/Makefile
+++ b/drivers/net/ethernet/microchip/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_ENC28J60) += enc28j60.o
 obj-$(CONFIG_ENCX24J600) += encx24j600.o encx24j600-regmap.o
 obj-$(CONFIG_LAN743X) += lan743x.o
 
-lan743x-objs := lan743x_main.o lan743x_ethtool.o
+lan743x-objs := lan743x_main.o lan743x_ethtool.o lan743x_ptp.o
diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c 
b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index aa3421e..de17fdf 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -4,6 +4,7 @@
 #include <linux/netdevice.h>
 #include "lan743x_main.h"
 #include "lan743x_ethtool.h"
+#include <linux/net_tstamp.h>
 #include <linux/pci.h>
 #include <linux/phy.h>
 
@@ -544,6 +545,32 @@ static int lan743x_ethtool_set_rxfh(struct net_device 
*netdev,
return 0;
 }
 
+static int lan743x_ethtool_get_ts_info(struct net_device *netdev,
+  struct ethtool_ts_info *ts_info)
+{
+   struct lan743x_adapter *adapter = netdev_priv(netdev);
+
+   ts_info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
+  SOF_TIMESTAMPING_RX_SOFTWARE |
+  SOF_TIMESTAMPING_SOFTWARE |
+  SOF_TIMESTAMPING_TX_HARDWARE |
+  SOF_TIMESTAMPING_RX_HARDWARE |
+  SOF_TIMESTAMPING_RAW_HARDWARE;
+#ifdef CONFIG_PTP_1588_CLOCK
+   if (adapter->ptp.ptp_clock)
+   ts_info->phc_index = ptp_clock_index(adapter->ptp.ptp_clock);
+   else
+   ts_info->phc_index = -1;
+#else
+   ts_info->phc_index = -1;
+#endif
+   ts_info->tx_types = BIT(HWTSTAMP_TX_OFF) |
+   BIT(HWTSTAMP_TX_ON);
+   ts_info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
+ BIT(HWTSTAMP_FILTER_ALL);
+   return 0;
+}
+
 static int lan743x_ethtool_get_eee(struct net_device *netdev,
   struct ethtool_eee *eee)
 {
@@ -695,6 +722,7 @@ const struct ethtool_ops lan743x_ethtool_ops = {
.get_rxfh_indir_size = lan743x_ethtool_get_rxfh_indir_size,
.get_rxfh = lan743x_ethtool_get_rxfh,
.set_rxfh = lan743x_ethtool_set_rxfh,
+   .get_ts_info = lan743x_ethtool_get_ts_info,
.get_eee = lan743x_ethtool_get_eee,
.set_eee = lan743x_ethtool_set_eee,
.get_link_ksettings = phy_ethtool_get_link_ksettings,
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index 01296e1..89fe9f3 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -266,6 +266,10 @@ static void lan743x_intr_shared_isr(void *context, u32 
int_sts, u32 flags)
lan743x_intr_software_isr(adapter);
int_sts &= ~INT_BIT_SW_GP_;
}
+   if (int_sts & INT_BIT_1588_) {
+   lan743x_ptp_isr(adapter);
+   int_sts &= ~INT_BIT_1588_;
+   }
}
if (int_sts)
lan743x_csr_write(adapter, INT_EN_CLR, int_sts);
@@ -975,6 +979,7 @@ static void lan743x_phy_link_status_change(struct 
net_device *netdev)
   ksettings.base.duplex,
   local_advertisement,
   remote_advertisement);
+   lan743x_ptp_update_latency(adapter, ksettings.base.speed);
}
 }
 
@@ -1255,11 +1260,29 @@ static void lan743x_tx_release_desc(struct lan743x_tx 
*tx,
buffer_info->dma_ptr = 0;
buffer_info->buffer_length = 0;
}
-   if (buffer_info->skb) {
+   if (!buffer_info->skb)
+   goto clear_active;
+
+   if (!(buffer_info->flags &
+   TX_BUFFER_INFO_FLAG_TIMESTAMP_REQUESTED)) {
dev_kfree_skb(buffer_info->skb);
-   buffer_info->skb = NULL;
+   goto clear_skb;
}
 
+   if (cleanup) {
+   lan743x_ptp_unrequest_tx_timestamp(tx->adapter);
+   dev_kfree_skb(buffer_info->skb);
+   } else {
+  

[bpf PATCH v2 4/4] bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb

2018-07-05 Thread John Fastabend
In commit

  'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512)

we added the routine bpf_compute_data_end_sk_skb() to compute the
correct data_end values, but this has since been lost. In kernel
v4.14 this was correct and the above patch was applied in its
entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree
we lost the piece that renamed bpf_compute_data_pointers to the
new function bpf_compute_data_end_sk_skb. This was done here,

e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")

When it conflicted with the following rename patch,

6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")

Finally, after a refactor I thought even the function
bpf_compute_data_end_sk_skb() was no longer needed and it was
erroneously removed.

However, we never reverted the sk_skb_convert_ctx_access() usage of
tcp_skb_cb which had been committed and survived the merge conflict.
Here we fix this by adding back the helper and *_data_end_sk_skb()
usage. Using the bpf_skb_data_end mapping is not correct because it
expects a qdisc_skb_cb object but at the sock layer this is not the
case. Even though it happens to work here because we don't overwrite
any data in-use at the socket layer and the cb structure is cleared
later this has potential to create some subtle issues. But, even
more concretely the filter.c access check uses tcp_skb_cb.

And by some act of chance though,

struct bpf_skb_data_end {
        struct qdisc_skb_cb        qdisc_cb;             /*     0    28 */

        /* XXX 4 bytes hole, try to pack */

        void *                     data_meta;            /*    32     8 */
        void *                     data_end;             /*    40     8 */

        /* size: 48, cachelines: 1, members: 3 */
        /* sum members: 44, holes: 1, sum holes: 4 */
        /* last cacheline: 48 bytes */
};

and then tcp_skb_cb,

struct tcp_skb_cb {
        [...]
        struct {
                __u32          flags;            /*    24     4 */
                struct sock *  sk_redir;         /*    32     8 */
                void *         data_end;         /*    40     8 */
        } bpf;                                   /*          24 */
};

So when we use offsetof() to track down the byte offset we get 40 in
either case and everything continues to work. Fix this mess and use the
correct structures; it's unclear how long this would actually keep working
before someone moves the structs around.

Reported-by: Martin KaFai Lau 
Fixes: e1ea2f9856b7 ("Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")
Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into 
bpf_compute_data_pointers")
Signed-off-by: John Fastabend 
---
 include/net/tcp.h|4 ++
 kernel/bpf/sockmap.c |4 +-
 net/core/filter.c|   98 ++
 3 files changed, 97 insertions(+), 9 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 800582b..af3ec72 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -828,6 +828,10 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)  ((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+   TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
 
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dfc8a8a..98fb793 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, 
struct sk_buff *skb)
 */
TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
skb->sk = psock->sock;
-   bpf_compute_data_pointers(skb);
+   bpf_compute_data_end_sk_skb(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
@@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser 
*strp,
 * any socket yet.
 */
skb->sk = psock->sock;
-   bpf_compute_data_pointers(skb);
+   bpf_compute_data_end_sk_skb(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
diff --git a/net/core/filter.c b/net/core/filter.c
index 3095f1b..4702680 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1762,6 +1762,37 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff 
*skb)
.arg2_type  = ARG_ANYTHING,
 };
 
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+  unsigned int write_len)
+{
+   int err = __bpf_try_make_writable(skb, write_len);
+
+   bpf_compute_data_end_sk_skb(skb);
+   return err;
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+   /* Idea is the following: should the needed direct read/write
+* test fail 

[bpf PATCH v2 3/4] bpf: sockmap, consume_skb in close path

2018-07-05 Thread John Fastabend
Currently, when a sock is closed and the bpf_tcp_close() callback is
used we remove memory but do not free the skb. Call consume_skb() if
the skb is attached to the buffer.

Reported-by: syzbot+d464d2c20c717ef5a...@syzkaller.appspotmail.com
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 9c67e96..dfc8a8a 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct 
sk_msg_buff *md)
while (sg[i].length) {
free += sg[i].length;
sk_mem_uncharge(sk, sg[i].length);
-   put_page(sg_page(&sg[i]));
+   if (!md->skb)
+   put_page(sg_page(&sg[i]));
sg[i].length = 0;
sg[i].page_link = 0;
sg[i].offset = 0;
@@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct 
sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+   if (md->skb)
+   consume_skb(md->skb);
 
return free;
 }



[bpf PATCH v2 0/4] sockhash/sockmap fixes

2018-07-05 Thread John Fastabend
First three patches resolve issues found while testing sockhash and
reviewing code. Syzbot also found them about the same time as I was
working on fixes. The main issue is in the sockhash path we reduced
the scope of sk_callback lock but this meant we could get update and
close running in parallel so fix that here.

Then testing sk_msg and sk_skb programs together found that skb->dev
is not always assigned and some of the helpers were depending on this
to lookup max mtu. Fix this by using SKB_MAX_ALLOC when no MTU is
available.

Finally, Martin spotted that the sockmap code was still using the
qdisc skb cb structure. But I was sure we had fixed this long ago.
Looks like we missed it in a merge conflict resolution and then by
chance data_end offset was the same in both structures so everything
sort of continued to work even though it could break at any moment
if the structs ever change. So redo the conversion and this time
also convert the helpers.

v2: fix '0 files changed' issue in patches

---

John Fastabend (4):
  bpf: fix sk_skb programs without skb->dev assigned
  bpf: sockhash, disallow bpf_tcp_close and update in parallel
  bpf: sockmap, consume_skb in close path
  bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb


 include/net/tcp.h|4 ++
 kernel/bpf/sockmap.c |   24 ++--
 kernel/bpf/syscall.c |4 +-
 net/core/filter.c|  101 ++
 4 files changed, 121 insertions(+), 12 deletions(-)

--
Signature


[bpf PATCH v2 2/4] bpf: sockhash, disallow bpf_tcp_close and update in parallel

2018-07-05 Thread John Fastabend
After latest lock updates there is no longer anything preventing a
close and recvmsg call running in parallel. Additionally, we can
race update with close if we close a socket and simultaneously update
it via the BPF userspace API (note the cgroup ops are already run
with sock_lock held).

To resolve this take sock_lock in close and update paths.

Reported-by: syzbot+b680e42077a0d7c9a...@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   15 +++
 kernel/bpf/syscall.c |4 +++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 00fb2e3..9c67e96 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
struct smap_psock *psock;
struct sock *osk;
 
+   lock_sock(sk);
rcu_read_lock();
psock = smap_psock_sk(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
+   release_sock(sk);
return sk->sk_prot->close(sk, timeout);
}
 
@@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
e = psock_map_pop(sk, psock);
}
rcu_read_unlock();
+   release_sock(sk);
close_fun(sk, timeout);
 }
 
@@ -2069,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EOPNOTSUPP;
}
 
+   lock_sock(skops.sk);
+   preempt_disable();
+   rcu_read_lock();
err = sock_map_ctx_update_elem(&skops, map, key, flags);
+   rcu_read_unlock();
+   preempt_enable();
+   release_sock(skops.sk);
fput(socket->file);
return err;
 }
@@ -2410,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   lock_sock(skops.sk);
+   preempt_disable();
+   rcu_read_lock();
err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+   rcu_read_unlock();
+   preempt_enable();
+   release_sock(skops.sk);
fput(socket->file);
return err;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d10ecd7..a31a1ba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_update_elem(map, key, value, attr->flags);
goto out;
-   } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+   } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+  map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+  map->map_type == BPF_MAP_TYPE_SOCKMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}



[bpf PATCH v2 1/4] bpf: fix sk_skb programs without skb->dev assigned

2018-07-05 Thread John Fastabend
Multiple BPF helpers in use by sk_skb programs calculate the max
skb length using the __bpf_skb_max_len function. However, this
calculates the max length using the skb->dev pointer which can be
NULL when an sk_skb program is paired with an sk_msg program.

To trigger this, an sk_msg program needs to redirect into the ingress
path of a sock with an attached sk_skb program. Then the sk_skb
program would need to call one of the helpers that adjust the skb
size.

To fix the null ptr dereference use SKB_MAX_ALLOC size if no dev
is available.

Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support")
Signed-off-by: John Fastabend 
---
 net/core/filter.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0ca6907..3095f1b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2779,7 +2779,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 
len_diff)
 
 static u32 __bpf_skb_max_len(const struct sk_buff *skb)
 {
-   return skb->dev->mtu + skb->dev->hard_header_len;
+   return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+ SKB_MAX_ALLOC;
 }
 
 static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)



Re: [bpf PATCH 0/4] sockhash/sockmap fixes

2018-07-05 Thread John Fastabend
On 07/05/2018 08:36 AM, John Fastabend wrote:
> First three patches resolve issues found while testing sockhash and
> reviewing code. Syzbot also found them about the same time as I was
> working on fixes. The main issue is in the sockhash path we reduced
> the scope of sk_callback lock but this meant we could get update and
> close running in parallel so fix that here.

[...]

> ---
> 
> John Fastabend (4):
>   bpf: fix sk_skb programs without skb->dev assigned
>   bpf: sockhash, disallow bpf_tcp_close and update in parallel
>   bpf: sockmap, consume_skb in close path
>   bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb
> 
> 
>  0 files changed
^^^

Ah dang, something about my workflow causes git to report 0 files
changed occasionally, I forgot to check it before sending. I'll send
a v2. And get some more coffee I guess. Thanks sorry for the noise.


Re: [PATCHv2 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-05 Thread Xin Long
On Thu, Jul 5, 2018 at 4:21 PM, Xin Long  wrote:
> On Thu, Jul 5, 2018 at 4:39 AM, Ido Schimmel  wrote:
>> On Thu, Jul 05, 2018 at 01:56:23AM +0800, Xin Long wrote:
>>> On Wed, Jul 4, 2018 at 3:23 AM, David Ahern  wrote:
>>> > your commands are not a proper test. The test should succeed and fail
>>> > based on the routing lookup, not iptables rules.
>>> A proper test can be done easily with netns, as vrf can't isolate much.
>>> I don't want to bother forwarding/ directory with netns, so I will probably
>>> just drop this selftest, and let the feature patch go first.
>>>
>>> What do you think?
>>
>> You can add a tc rule on the ingress of h2 and make sure that in the
>> first case ping succeeds and the tc rule wasn't hit. In the second case
>> ping should also succeed, but the tc rule should be hit. This is similar
>> to your original netns test.
> With netns, it will be much easier to use
> sysctl net.ipv4.icmp_echo_ignore_broadcasts
> to block the echo_request on r1 or h2, and check if ping works.
> (this's more like the idea of using 'iptables' above) :D
>
>>
>> You can look at tc_flower.sh for reference and in particular at
>> tc_check_packets().
Just noticed this doesn't require reply with MZ. that's better.
Thanks.

> This is a way similar idea of using tcpdump, I just feel it's too much,
> this test should be an as simple test as route.sh. :)


[bpf PATCH 3/4] bpf: sockmap, consume_skb in close path

2018-07-05 Thread John Fastabend
Currently, when a sock is closed and the bpf_tcp_close() callback is
used we remove memory but do not free the skb. Call consume_skb() if
the skb is attached to the buffer.

Reported-by: syzbot+d464d2c20c717ef5a...@syzkaller.appspotmail.com
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend 
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 9c67e96..dfc8a8a 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct 
sk_msg_buff *md)
while (sg[i].length) {
free += sg[i].length;
sk_mem_uncharge(sk, sg[i].length);
-   put_page(sg_page(&sg[i]));
+   if (!md->skb)
+   put_page(sg_page(&sg[i]));
sg[i].length = 0;
sg[i].page_link = 0;
sg[i].offset = 0;
@@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct 
sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+   if (md->skb)
+   consume_skb(md->skb);
 
return free;
 }



[bpf PATCH 4/4] bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb

2018-07-05 Thread John Fastabend
In commit

  'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512)

we added the routine bpf_compute_data_end_sk_skb() to compute the
correct data_end values, but this has since been lost. In kernel
v4.14 this was correct and the above patch was applied in its
entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree
we lost the piece that renamed bpf_compute_data_pointers to the
new function bpf_compute_data_end_sk_skb. This was done here,

e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")

When it conflicted with the following rename patch,

6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")

Finally, after a refactor I thought even the function
bpf_compute_data_end_sk_skb() was no longer needed and it was
erroneously removed.

However, we never reverted the sk_skb_convert_ctx_access() usage of
tcp_skb_cb which had been committed and survived the merge conflict.
Here we fix this by adding back the helper and *_data_end_sk_skb()
usage. Using the bpf_skb_data_end mapping is not correct because it
expects a qdisc_skb_cb object but at the sock layer this is not the
case. Even though it happens to work here because we don't overwrite
any data in-use at the socket layer and the cb structure is cleared
later this has potential to create some subtle issues. But, even
more concretely the filter.c access check uses tcp_skb_cb.

And by some act of chance though,

struct bpf_skb_data_end {
        struct qdisc_skb_cb        qdisc_cb;             /*     0    28 */

        /* XXX 4 bytes hole, try to pack */

        void *                     data_meta;            /*    32     8 */
        void *                     data_end;             /*    40     8 */

        /* size: 48, cachelines: 1, members: 3 */
        /* sum members: 44, holes: 1, sum holes: 4 */
        /* last cacheline: 48 bytes */
};

and then tcp_skb_cb,

struct tcp_skb_cb {
        [...]
        struct {
                __u32          flags;            /*    24     4 */
                struct sock *  sk_redir;         /*    32     8 */
                void *         data_end;         /*    40     8 */
        } bpf;                                   /*          24 */
};

So when we use offsetof() to track down the byte offset we get 40 in
either case and everything continues to work. Fix this mess and use the
correct structures; it's unclear how long this would actually keep working
before someone moves the structs around.

Reported-by: Martin KaFai Lau 
Fixes: e1ea2f9856b7 ("Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")
Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into 
bpf_compute_data_pointers")
Signed-off-by: John Fastabend 
---
 0 files changed

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 800582b..af3ec72 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -828,6 +828,10 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)  ((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+   TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
 
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dfc8a8a..98fb793 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, 
struct sk_buff *skb)
 */
TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
skb->sk = psock->sock;
-   bpf_compute_data_pointers(skb);
+   bpf_compute_data_end_sk_skb(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
@@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser 
*strp,
 * any socket yet.
 */
skb->sk = psock->sock;
-   bpf_compute_data_pointers(skb);
+   bpf_compute_data_end_sk_skb(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
diff --git a/net/core/filter.c b/net/core/filter.c
index 3095f1b..4702680 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1762,6 +1762,37 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff 
*skb)
.arg2_type  = ARG_ANYTHING,
 };
 
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+  unsigned int write_len)
+{
+   int err = __bpf_try_make_writable(skb, write_len);
+
+   bpf_compute_data_end_sk_skb(skb);
+   return err;
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+   /* Idea is the following: should the needed direct read/write
+* test fail during runtime, we can pull in more data and redo
+* again, since implicitly, we invalidate previous checks here.
+*
+* Or, since we know how much we 

[bpf PATCH 0/4] sockhash/sockmap fixes

2018-07-05 Thread John Fastabend
First three patches resolve issues found while testing sockhash and
reviewing code. Syzbot also found them about the same time as I was
working on fixes. The main issue is in the sockhash path we reduced
the scope of sk_callback lock but this meant we could get update and
close running in parallel so fix that here.

Then testing sk_msg and sk_skb programs together found that skb->dev
is not always assigned and some of the helpers were depending on this
to lookup max mtu. Fix this by using SKB_MAX_ALLOC when no MTU is
available.

Finally, Martin spotted that the sockmap code was still using the
qdisc skb cb structure. But I was sure we had fixed this long ago.
Looks like we missed it in a merge conflict resolution and then by
chance data_end offset was the same in both structures so everything
sort of continued to work even though it could break at any moment
if the structs ever change. So redo the conversion and this time
also convert the helpers.

---

John Fastabend (4):
  bpf: fix sk_skb programs without skb->dev assigned
  bpf: sockhash, disallow bpf_tcp_close and update in parallel
  bpf: sockmap, consume_skb in close path
  bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb


 0 files changed

--
Signature


[bpf PATCH 2/4] bpf: sockhash, disallow bpf_tcp_close and update in parallel

2018-07-05 Thread John Fastabend
After latest lock updates there is no longer anything preventing a
close and recvmsg call running in parallel. Additionally, we can
race update with close if we close a socket and simultaneously update
it via the BPF userspace API (note the cgroup ops are already run
with sock_lock held).

To resolve this take sock_lock in close and update paths.

Reported-by: syzbot+b680e42077a0d7c9a...@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Signed-off-by: John Fastabend 
---
 0 files changed

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 00fb2e3..9c67e96 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
struct smap_psock *psock;
struct sock *osk;
 
+   lock_sock(sk);
rcu_read_lock();
psock = smap_psock_sk(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
+   release_sock(sk);
return sk->sk_prot->close(sk, timeout);
}
 
@@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
e = psock_map_pop(sk, psock);
}
rcu_read_unlock();
+   release_sock(sk);
close_fun(sk, timeout);
 }
 
@@ -2069,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EOPNOTSUPP;
}
 
+   lock_sock(skops.sk);
+   preempt_disable();
+   rcu_read_lock();
err = sock_map_ctx_update_elem(, map, key, flags);
+   rcu_read_unlock();
+   preempt_enable();
+   release_sock(skops.sk);
fput(socket->file);
return err;
 }
@@ -2410,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
 
+   lock_sock(skops.sk);
+   preempt_disable();
+   rcu_read_lock();
err = sock_hash_ctx_update_elem(, map, key, flags);
+   rcu_read_unlock();
+   preempt_enable();
+   release_sock(skops.sk);
fput(socket->file);
return err;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d10ecd7..a31a1ba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_update_elem(map, key, value, attr->flags);
goto out;
-   } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+   } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+  map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+  map->map_type == BPF_MAP_TYPE_SOCKMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}



[bpf PATCH 1/4] bpf: fix sk_skb programs without skb->dev assigned

2018-07-05 Thread John Fastabend
Multiple BPF helpers in use by sk_skb programs calculate the max
skb length using the __bpf_skb_max_len function. However, this
calculates the max length using the skb->dev pointer which can be
NULL when an sk_skb program is paired with an sk_msg program.

To force this a sk_msg program needs to redirect into the ingress
path of a sock with an attached sk_skb program. Then the sk_skb
program would need to call one of the helpers that adjust the skb
size.

To fix the null ptr dereference use SKB_MAX_ALLOC size if no dev
is available.

Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support")
Signed-off-by: John Fastabend 
---
 0 files changed

diff --git a/net/core/filter.c b/net/core/filter.c
index 0ca6907..3095f1b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2779,7 +2779,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 
len_diff)
 
 static u32 __bpf_skb_max_len(const struct sk_buff *skb)
 {
-   return skb->dev->mtu + skb->dev->hard_header_len;
+   return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+ SKB_MAX_ALLOC;
 }
 
 static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)



[PATCH v2 iproute2] man: Fix typos on tc-cbs

2018-07-05 Thread Jesus Sanchez-Palencia
Fix 2 typos on the man page of the CBS qdisc.

Signed-off-by: Jesus Sanchez-Palencia 
Reviewed-by: Simon Horman 
---
 man/man8/tc-cbs.8 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc-cbs.8 b/man/man8/tc-cbs.8
index 32e1e0d4..ad1d8821 100644
--- a/man/man8/tc-cbs.8
+++ b/man/man8/tc-cbs.8
@@ -28,7 +28,7 @@ defined rate limiting method to the traffic.
 This queueing discipline is intended to be used by TSN (Time Sensitive
 Networking) applications, the CBS parameters are derived directly by
 what is described by the Annex L of the IEEE 802.1Q-2014
-Sepcification. The algorithm and how it affects the latency are
+Specification. The algorithm and how it affects the latency are
 detailed there.
 
 CBS is meant to be installed under another qdisc that maps packet
@@ -60,7 +60,7 @@ packet size, which is then used for calculating the idleslope.
 sendslope
 Sendslope is the rate of credits that is depleted (it should be a
 negative number of kilobits per second) when a transmission is
-ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
 8.6.8.2 item g):
 
 sendslope = idleslope - port_transmit_rate
-- 
2.18.0



[bpf PATCH v2 1/2] bpf: sockmap, error path can not release psock in multi-map case

2018-07-05 Thread John Fastabend
The current code, in the error path of sock_hash_ctx_update_elem,
checks if the sock has a psock in the user data and if so decrements
the reference count of the psock. However, if the error happens early
in the error path we may have never incremented the psock reference
count and if the psock exists because the sock is in another map then
we may inadvertently decrement the reference count.

Fix this by making the error path only call smap_release_sock if the
error happens after the increment.

Reported-by: syzbot+d464d2c20c717ef5a...@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |   17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index cf7b6a6..3847a7c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1896,7 +1896,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
if (!e) {
err = -ENOMEM;
-   goto out_progs;
+   goto out_free;
}
}
 
@@ -2342,7 +2342,10 @@ static int sock_hash_ctx_update_elem(struct 
bpf_sock_ops_kern *skops,
if (err)
goto err;
 
-   /* bpf_map_update_elem() can be called in_irq() */
+   /* psock is valid here because otherwise above *ctx_update_elem would
+* have thrown an error. It is safe to skip error check.
+*/
+   psock = smap_psock_sk(sock);
raw_spin_lock_bh(>lock);
l_old = lookup_elem_raw(head, hash, key, key_size);
if (l_old && map_flags == BPF_NOEXIST) {
@@ -2360,12 +2363,6 @@ static int sock_hash_ctx_update_elem(struct 
bpf_sock_ops_kern *skops,
goto bucket_err;
}
 
-   psock = smap_psock_sk(sock);
-   if (unlikely(!psock)) {
-   err = -EINVAL;
-   goto bucket_err;
-   }
-
rcu_assign_pointer(e->hash_link, l_new);
rcu_assign_pointer(e->htab,
   container_of(map, struct bpf_htab, map));
@@ -2388,12 +2385,10 @@ static int sock_hash_ctx_update_elem(struct 
bpf_sock_ops_kern *skops,
raw_spin_unlock_bh(>lock);
return 0;
 bucket_err:
+   smap_release_sock(psock, sock);
raw_spin_unlock_bh(>lock);
 err:
kfree(e);
-   psock = smap_psock_sk(sock);
-   if (psock)
-   smap_release_sock(psock, sock);
return err;
 }
 



[bpf PATCH v2 2/2] bpf: sockmap, hash table is RCU so readers do not need locks

2018-07-05 Thread John Fastabend
This removes locking from readers of the RCU hash table. It's not
necessary.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend 
---
 kernel/bpf/sockmap.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 3847a7c..00fb2e3 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2467,10 +2467,8 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map 
*map, void *key)
b = __select_bucket(htab, hash);
head = >head;
 
-   raw_spin_lock_bh(>lock);
l = lookup_elem_raw(head, hash, key, key_size);
sk = l ? l->sk : NULL;
-   raw_spin_unlock_bh(>lock);
return sk;
 }
 



[bpf PATCH v2 0/2] sockmap, syzbot fix error path and RCU fix

2018-07-05 Thread John Fastabend
I missed fixing the error path in the sockhash code to align with
supporting socks in multiple maps. Simply checking if the psock is
present does not mean we can decrement the reference count because
it could be part of another map. Fix this by cleaning up the error
path so this situation does not happen.

---

John Fastabend (2):
  bpf: sockmap, error path can not release psock in multi-map case
  bpf: sockmap, hash table is RCU so readers do not need locks


 kernel/bpf/sockmap.c |   19 ++-
 1 file changed, 6 insertions(+), 13 deletions(-)

--
Signature


[PATCH net-next 2/2] net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()

2018-07-05 Thread Edward Cree
Essentially the same as the ipv4 equivalents.

Signed-off-by: Edward Cree 
---
 include/net/ipv6.h   |   2 +
 net/ipv6/af_inet6.c  |   1 +
 net/ipv6/ip6_input.c | 131 ---
 3 files changed, 118 insertions(+), 16 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 16475c269749..b7843e0b16ee 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -922,6 +922,8 @@ static inline __be32 flowi6_get_flowlabel(const struct 
flowi6 *fl6)
 
 int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
 struct packet_type *pt, struct net_device *orig_dev);
+void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+  struct net_device *orig_dev);
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9ed0eae91758..c9535354149f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
 static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.func = ipv6_rcv,
+   .list_func = ipv6_list_rcv,
 };
 
 static int __init ipv6_packet_init(void)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f08d34491ece..6242682be876 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,17 +47,11 @@
 #include 
 #include 
 
-int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
+   struct sk_buff *skb)
 {
void (*edemux)(struct sk_buff *skb);
 
-   /* if ingress device is enslaved to an L3 master device pass the
-* skb to its handler for processing
-*/
-   skb = l3mdev_ip6_rcv(skb);
-   if (!skb)
-   return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == 
NULL) {
const struct inet6_protocol *ipprot;
 
@@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct 
sk_buff *skb)
}
if (!skb_valid_dst(skb))
ip6_route_input(skb);
+}
+
+int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+   /* if ingress device is enslaved to an L3 master device pass the
+* skb to its handler for processing
+*/
+   skb = l3mdev_ip6_rcv(skb);
+   if (!skb)
+   return NET_RX_SUCCESS;
+   ip6_rcv_finish_core(net, sk, skb);
 
return dst_input(skb);
 }
 
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type 
*pt, struct net_device *orig_dev)
+static void ip6_sublist_rcv_finish(struct list_head *head)
+{
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list)
+   dst_input(skb);
+}
+
+static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
+   struct list_head *head)
+{
+   struct dst_entry *curr_dst = NULL;
+   struct sk_buff *skb, *next;
+   struct list_head sublist;
+
+   INIT_LIST_HEAD();
+   list_for_each_entry_safe(skb, next, head, list) {
+   struct dst_entry *dst;
+
+   list_del(>list);
+   /* if ingress device is enslaved to an L3 master device pass the
+* skb to its handler for processing
+*/
+   skb = l3mdev_ip6_rcv(skb);
+   if (!skb)
+   continue;
+   ip6_rcv_finish_core(net, sk, skb);
+   dst = skb_dst(skb);
+   if (curr_dst != dst) {
+   /* dispatch old sublist */
+   if (!list_empty())
+   ip6_sublist_rcv_finish();
+   /* start new sublist */
+   INIT_LIST_HEAD();
+   curr_dst = dst;
+   }
+   list_add_tail(>list, );
+   }
+   /* dispatch final sublist */
+   ip6_sublist_rcv_finish();
+}
+
+static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device 
*dev,
+   struct net *net)
 {
const struct ipv6hdr *hdr;
u32 pkt_len;
struct inet6_dev *idev;
-   struct net *net = dev_net(skb->dev);
 
if (skb->pkt_type == PACKET_OTHERHOST) {
kfree_skb(skb);
-   return NET_RX_DROP;
+   return NULL;
}
 
rcu_read_lock();
@@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt
if (ipv6_parse_hopopts(skb) < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
rcu_read_unlock();
-   return NET_RX_DROP;
+   return NULL;
}
}
 
@@ -205,15 +252,67 @@ int 

[PATCH net-next 1/2] net: ipv4: fix list processing on L3 slave devices

2018-07-05 Thread Edward Cree
If we have an L3 master device, l3mdev_ip_rcv() will steal the skb, but
 we were returning NET_RX_SUCCESS from ip_rcv_finish_core() which meant
 that ip_list_rcv_finish() would keep it on the list.  Instead let's
 move the l3mdev_ip_rcv() call into the caller, so that our response to
 a steal can be different in the single packet path (return
 NET_RX_SUCCESS) and the list path (forget this packet and continue).

Fixes: 5fa12739a53d ("net: ipv4: listify ip_rcv_finish")
Signed-off-by: Edward Cree 
---
 net/ipv4/ip_input.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 14ba628b2761..1a3b6f32b1c9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -316,13 +316,6 @@ static int ip_rcv_finish_core(struct net *net, struct sock 
*sk,
struct rtable *rt;
int err;
 
-   /* if ingress device is enslaved to an L3 master device pass the
-* skb to its handler for processing
-*/
-   skb = l3mdev_ip_rcv(skb);
-   if (!skb)
-   return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -408,8 +401,16 @@ static int ip_rcv_finish_core(struct net *net, struct sock 
*sk,
 
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-   int ret = ip_rcv_finish_core(net, sk, skb);
+   int ret;
+
+   /* if ingress device is enslaved to an L3 master device pass the
+* skb to its handler for processing
+*/
+   skb = l3mdev_ip_rcv(skb);
+   if (!skb)
+   return NET_RX_SUCCESS;
 
+   ret = ip_rcv_finish_core(net, sk, skb);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -545,6 +546,12 @@ static void ip_list_rcv_finish(struct net *net, struct 
sock *sk,
struct dst_entry *dst;
 
list_del(>list);
+   /* if ingress device is enslaved to an L3 master device pass the
+* skb to its handler for processing
+*/
+   skb = l3mdev_ip_rcv(skb);
+   if (!skb)
+   continue;
if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
continue;
 



[PATCH net-next 0/2] IP listification follow-ups

2018-07-05 Thread Edward Cree
While working on IPv6 list processing, I found another bug in the IPv4
 version.  So this patch series has that fix, and the IPv6 version with
 both fixes incorporated.

Edward Cree (2):
  net: ipv4: fix list processing on L3 slave devices
  net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()

 include/net/ipv6.h   |   2 +
 net/ipv4/ip_input.c  |  23 +
 net/ipv6/af_inet6.c  |   1 +
 net/ipv6/ip6_input.c | 131 ---
 4 files changed, 133 insertions(+), 24 deletions(-)



Re: [bpf PATCH 1/2] bpf: sockmap, error path can not release psock in multi-map case

2018-07-05 Thread John Fastabend
On 07/03/2018 07:40 AM, Daniel Borkmann wrote:
> On 06/30/2018 03:51 PM, John Fastabend wrote:
>> The current code, in the error path of sock_hash_ctx_update_elem,
>> checks if the sock has a psock in the user data and if so decrements
>> the reference count of the psock. However, if the error happens early
>> in the error path we may have never incremented the psock reference
>> count and if the psock exists because the sock is in another map then
>> we may inadvertently decrement the reference count.
>>
>> Fix this by making the error path only call smap_release_sock if the
>> error happens after the increment.
>>
>> Reported-by: syzbot+d464d2c20c717ef5a...@syzkaller.appspotmail.com
>> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
>> Signed-off-by: John Fastabend 
>> ---

[...]

>> @@ -2324,7 +2324,12 @@ static int sock_hash_ctx_update_elem(struct 
>> bpf_sock_ops_kern *skops,
>>  if (err)
>>  goto err;
>>  
>> -/* bpf_map_update_elem() can be called in_irq() */
>> +psock = smap_psock_sk(sock);
>> +if (unlikely(!psock)) {
>> +err = -EINVAL;
>> +goto err;
>> +}
> 
> Is an error even possible at this point? If __sock_map_ctx_update_elem() 
> succeeds,
> we either allocated and linked a new psock to the sock or we inc'ed the 
> existing
> one's refcount. From my reading it seems we should always succeed the 
> subsequent
> smap_psock_sk(). If we would have failed here in between it would mean we'd 
> have
> a refcount imbalance somewhere?
> 

It's not possible; I will replace the check with a comment. Thanks.


[PATCH net-next v6 02/11] net: sched: change type of reference and bind counters

2018-07-05 Thread Vlad Buslov
Change type of action reference counter to refcount_t.

Change type of action bind counter to atomic_t.
This type is used to allow decrementing bind counter without testing
for 0 result.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
 include/net/act_api.h  |  5 +++--
 net/sched/act_api.c| 32 ++--
 net/sched/act_bpf.c|  4 ++--
 net/sched/act_connmark.c   |  4 ++--
 net/sched/act_csum.c   |  4 ++--
 net/sched/act_gact.c   |  4 ++--
 net/sched/act_ife.c|  4 ++--
 net/sched/act_ipt.c|  4 ++--
 net/sched/act_mirred.c |  4 ++--
 net/sched/act_nat.c|  4 ++--
 net/sched/act_pedit.c  |  4 ++--
 net/sched/act_police.c |  4 ++--
 net/sched/act_sample.c |  4 ++--
 net/sched/act_simple.c |  4 ++--
 net/sched/act_skbedit.c|  4 ++--
 net/sched/act_skbmod.c |  4 ++--
 net/sched/act_tunnel_key.c |  4 ++--
 net/sched/act_vlan.c   |  4 ++--
 18 files changed, 57 insertions(+), 44 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index ffc3ef321776..2759226527a2 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -6,6 +6,7 @@
  * Public action API for classifiers/qdiscs
 */
 
+#include 
 #include 
 #include 
 #include 
@@ -26,8 +27,8 @@ struct tc_action {
struct tcf_idrinfo  *idrinfo;
 
u32 tcfa_index;
-   int tcfa_refcnt;
-   int tcfa_bindcnt;
+   refcount_t  tcfa_refcnt;
+   atomic_ttcfa_bindcnt;
u32 tcfa_capab;
int tcfa_action;
struct tcf_ttcfa_tm;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 02670c7489e3..4f064ecab882 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, 
bool strict)
 
ASSERT_RTNL();
 
+   /* Release with strict==1 and bind==0 is only called through act API
+* interface (classifiers always bind). Only case when action with
+* positive reference count and zero bind count can exist is when it was
+* also created with act API (unbinding last classifier will destroy the
+* action if it was created by classifier). So only case when bind count
+* can be changed after initial check is when unbound action is
+* destroyed by act API while classifier binds to action with same id
+* concurrently. This result either creation of new action(same behavior
+* as before), or reusing existing action if concurrent process
+* increments reference count before action is deleted. Both scenarios
+* are acceptable.
+*/
if (p) {
if (bind)
-   p->tcfa_bindcnt--;
-   else if (strict && p->tcfa_bindcnt > 0)
+   atomic_dec(>tcfa_bindcnt);
+   else if (strict && atomic_read(>tcfa_bindcnt) > 0)
return -EPERM;
 
-   p->tcfa_refcnt--;
-   if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+   if (atomic_read(>tcfa_bindcnt) <= 0 &&
+   refcount_dec_and_test(>tcfa_refcnt)) {
if (p->ops->cleanup)
p->ops->cleanup(p);
tcf_idr_remove(p->idrinfo, p);
@@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
struct tc_action **a,
 
if (index && p) {
if (bind)
-   p->tcfa_bindcnt++;
-   p->tcfa_refcnt++;
+   atomic_inc(>tcfa_bindcnt);
+   refcount_inc(>tcfa_refcnt);
*a = p;
return true;
}
@@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (unlikely(!p))
return -ENOMEM;
-   p->tcfa_refcnt = 1;
+   refcount_set(>tcfa_refcnt, 1);
if (bind)
-   p->tcfa_bindcnt = 1;
+   atomic_set(>tcfa_bindcnt, 1);
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
@@ -782,7 +794,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
return;
 
list_for_each_entry(a, actions, list)
-   a->tcfa_refcnt--;
+   refcount_dec(>tcfa_refcnt);
 }
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
@@ -810,7 +822,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
act->order = i;
sz += tcf_action_fill_size(act);
if (ovr)
-   act->tcfa_refcnt++;
+   

[PATCH net-next v6 03/11] net: sched: implement unlocked action init API

2018-07-05 Thread Vlad Buslov
Add additional 'rtnl_held' argument to act API init functions. It is
required to implement actions that need to release the rtnl lock before loading
a kernel module and reacquire it afterwards.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Rename "unlocked" to "rtnl_held" for clarity.

 include/net/act_api.h  |  6 --
 net/sched/act_api.c| 18 +++---
 net/sched/act_bpf.c|  3 ++-
 net/sched/act_connmark.c   |  2 +-
 net/sched/act_csum.c   |  3 ++-
 net/sched/act_gact.c   |  3 ++-
 net/sched/act_ife.c|  3 ++-
 net/sched/act_ipt.c|  6 --
 net/sched/act_mirred.c |  5 +++--
 net/sched/act_nat.c|  2 +-
 net/sched/act_pedit.c  |  3 ++-
 net/sched/act_police.c |  2 +-
 net/sched/act_sample.c |  3 ++-
 net/sched/act_simple.c |  3 ++-
 net/sched/act_skbedit.c|  3 ++-
 net/sched/act_skbmod.c |  3 ++-
 net/sched/act_tunnel_key.c |  3 ++-
 net/sched/act_vlan.c   |  3 ++-
 net/sched/cls_api.c|  5 +++--
 19 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 2759226527a2..27823f4e24c4 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -92,7 +92,8 @@ struct tc_action_ops {
  struct netlink_ext_ack *extack);
int (*init)(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act, int ovr,
-   int bind, struct netlink_ext_ack *extack);
+   int bind, bool rtnl_held,
+   struct netlink_ext_ack *extack);
int (*walk)(struct net *, struct sk_buff *,
struct netlink_callback *, int,
const struct tc_action_ops *,
@@ -168,10 +169,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
struct list_head *actions, size_t *attr_size,
-   struct netlink_ext_ack *extack);
+   bool rtnl_held, struct netlink_ext_ack *extack);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+   bool rtnl_held,
struct netlink_ext_ack *extack);
 int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4f064ecab882..256b0c93916c 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr 
**tb)
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+   bool rtnl_held,
struct netlink_ext_ack *extack)
 {
struct tc_action *a;
@@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
a_o = tc_lookup_action_n(act_name);
if (a_o == NULL) {
 #ifdef CONFIG_MODULES
-   rtnl_unlock();
+   if (rtnl_held)
+   rtnl_unlock();
request_module("act_%s", act_name);
-   rtnl_lock();
+   if (rtnl_held)
+   rtnl_lock();
 
a_o = tc_lookup_action_n(act_name);
 
@@ -746,9 +749,10 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, , ovr, bind,
-   extack);
+   rtnl_held, extack);
else
-   err = a_o->init(net, nla, est, , ovr, bind, extack);
+   err = a_o->init(net, nla, est, , ovr, bind, rtnl_held,
+   extack);
if (err < 0)
goto err_mod;
 
@@ -800,7 +804,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
struct list_head *actions, size_t *attr_size,
-   struct netlink_ext_ack *extack)
+   bool rtnl_held, struct netlink_ext_ack *extack)
 {
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -814,7 +818,7 @@ 

[PATCH net-next v6 05/11] net: sched: implement action API that deletes action by index

2018-07-05 Thread Vlad Buslov
Implement new action API function that atomically finds and deletes action
from idr by index. Intended to be used by lockless actions that do not rely
on rtnl lock.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Rename tcf_idr_find_delete to tcf_idr_delete_index.

 include/net/act_api.h |  1 +
 net/sched/act_api.c   | 39 +++
 2 files changed, 40 insertions(+)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 27823f4e24c4..a8eaae67c264 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -153,6 +153,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
   int bind, bool cpustats);
 void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
 
+int tcf_idr_delete_index(struct tc_action_net *tn, u32 index);
 int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
 
 static inline int tcf_idr_release(struct tc_action *a, bool bind)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index aa304d36fee0..0f31f09946ab 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -319,6 +319,45 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
struct tc_action **a,
 }
 EXPORT_SYMBOL(tcf_idr_check);
 
+int tcf_idr_delete_index(struct tc_action_net *tn, u32 index)
+{
+   struct tcf_idrinfo *idrinfo = tn->idrinfo;
+   struct tc_action *p;
+   int ret = 0;
+
+   spin_lock(>lock);
+   p = idr_find(>action_idr, index);
+   if (!p) {
+   spin_unlock(>lock);
+   return -ENOENT;
+   }
+
+   if (!atomic_read(>tcfa_bindcnt)) {
+   if (refcount_dec_and_test(>tcfa_refcnt)) {
+   struct module *owner = p->ops->owner;
+
+   WARN_ON(p != idr_remove(>action_idr,
+   p->tcfa_index));
+   spin_unlock(>lock);
+
+   if (p->ops->cleanup)
+   p->ops->cleanup(p);
+
+   gen_kill_estimator(>tcfa_rate_est);
+   free_tcf(p);
+   module_put(owner);
+   return 0;
+   }
+   ret = 0;
+   } else {
+   ret = -EPERM;
+   }
+
+   spin_unlock(>lock);
+   return ret;
+}
+EXPORT_SYMBOL(tcf_idr_delete_index);
+
 int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
   struct tc_action **a, const struct tc_action_ops *ops,
   int bind, bool cpustats)
-- 
2.7.5



[PATCH net-next v6 10/11] net: sched: atomically check-allocate action

2018-07-05 Thread Vlad Buslov
Implement function that atomically checks if action exists and either takes
reference to it, or allocates idr slot for action index to prevent
concurrent allocations of actions with same index. Use EBUSY error pointer
to indicate that idr slot is reserved.

Implement a cleanup helper function that removes the temporary error pointer
from idr (in case of an error between idr allocation and insertion of the newly
created action at the specified index).

Refactor all action init functions to insert new action to idr using this
API.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Remove unique idr insertion function. Change original idr insert to do
  the same thing.
- Refactor action check-alloc code into standalone function.

 include/net/act_api.h  |  3 ++
 net/sched/act_api.c| 92 --
 net/sched/act_bpf.c| 11 --
 net/sched/act_connmark.c   | 10 +++--
 net/sched/act_csum.c   | 11 --
 net/sched/act_gact.c   | 11 --
 net/sched/act_ife.c|  6 ++-
 net/sched/act_ipt.c| 13 ++-
 net/sched/act_mirred.c | 16 ++--
 net/sched/act_nat.c| 11 --
 net/sched/act_pedit.c  | 12 --
 net/sched/act_police.c |  9 -
 net/sched/act_sample.c | 11 --
 net/sched/act_simple.c | 11 +-
 net/sched/act_skbedit.c| 11 +-
 net/sched/act_skbmod.c | 11 +-
 net/sched/act_tunnel_key.c |  9 -
 net/sched/act_vlan.c   | 17 -
 18 files changed, 216 insertions(+), 59 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index b9ed2b8256a5..8090de2edab7 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -154,6 +154,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
   int bind, bool cpustats);
 void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
 
+void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
+int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
+   struct tc_action **a, int bind);
 int tcf_idr_delete_index(struct tc_action_net *tn, u32 index);
 int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index eefe8c2fe667..9511502e1cbb 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -303,7 +303,9 @@ static bool __tcf_idr_check(struct tc_action_net *tn, u32 
index,
 
spin_lock(>lock);
p = idr_find(>action_idr, index);
-   if (p) {
+   if (IS_ERR(p)) {
+   p = NULL;
+   } else if (p) {
refcount_inc(>tcfa_refcnt);
if (bind)
atomic_inc(>tcfa_bindcnt);
@@ -371,7 +373,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 {
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
-   struct idr *idr = >action_idr;
int err = -ENOMEM;
 
if (unlikely(!p))
@@ -389,20 +390,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
goto err2;
}
spin_lock_init(>tcfa_lock);
-   idr_preload(GFP_KERNEL);
-   spin_lock(>lock);
-   /* user doesn't specify an index */
-   if (!index) {
-   index = 1;
-   err = idr_alloc_u32(idr, NULL, , UINT_MAX, GFP_ATOMIC);
-   } else {
-   err = idr_alloc_u32(idr, NULL, , index, GFP_ATOMIC);
-   }
-   spin_unlock(>lock);
-   idr_preload_end();
-   if (err)
-   goto err3;
-
p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
@@ -412,7 +399,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
>tcfa_rate_est,
>tcfa_lock, NULL, est);
if (err)
-   goto err4;
+   goto err3;
}
 
p->idrinfo = idrinfo;
@@ -420,8 +407,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
INIT_LIST_HEAD(>list);
*a = p;
return 0;
-err4:
-   idr_remove(idr, index);
 err3:
free_percpu(p->cpu_qstats);
 err2:
@@ -437,11 +422,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct 
tc_action *a)
struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
spin_lock(>lock);
-   idr_replace(>action_idr, a, a->tcfa_index);
+   /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+   WARN_ON(!IS_ERR(idr_replace(>action_idr, a, a->tcfa_index)));
spin_unlock(>lock);
 }
 EXPORT_SYMBOL(tcf_idr_insert);
 
+/* Cleanup idr index that was allocated but not initialized. */
+
+void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
+{
+   struct 

[PATCH net-next v6 01/11] net: sched: use rcu for action cookie update

2018-07-05 Thread Vlad Buslov
Implement functions to atomically update and free action cookie
using rcu mechanism.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
 include/net/act_api.h |  2 +-
 include/net/pkt_cls.h |  1 +
 net/sched/act_api.c   | 44 ++--
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 5ff11adbe2a6..ffc3ef321776 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -37,7 +37,7 @@ struct tc_action {
spinlock_t  tcfa_lock;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
-   struct tc_cookie*act_cookie;
+   struct tc_cookie__rcu *act_cookie;
struct tcf_chain*goto_chain;
 };
 #define tcf_index  common.tcfa_index
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6641584b27f1..2081e4219f81 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -781,6 +781,7 @@ struct tc_mqprio_qopt_offload {
 struct tc_cookie {
u8  *data;
u32 len;
+   struct rcu_head rcu;
 };
 
 struct tc_qopt_offload_stats {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3f4cf930f809..02670c7489e3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct 
tc_action *a,
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
 }
 
+static void tcf_free_cookie_rcu(struct rcu_head *p)
+{
+   struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
+
+   kfree(cookie->data);
+   kfree(cookie);
+}
+
+static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
+ struct tc_cookie *new_cookie)
+{
+   struct tc_cookie *old;
+
+   old = xchg(old_cookie, new_cookie);
+   if (old)
+   call_rcu(>rcu, tcf_free_cookie_rcu);
+}
+
 /* XXX: For standalone actions, we don't need a RCU grace period either, 
because
  * actions are always connected to filters and filters are already destroyed in
  * RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p)
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
 
-   if (p->act_cookie) {
-   kfree(p->act_cookie->data);
-   kfree(p->act_cookie);
-   }
+   tcf_set_action_cookie(>act_cookie, NULL);
if (p->goto_chain)
tcf_action_goto_chain_fini(p);
 
@@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action 
*a, int bind, int ref)
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
+   struct tc_cookie *cookie;
 
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
-   if (a->act_cookie) {
-   if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
-   a->act_cookie->data))
+
+   rcu_read_lock();
+   cookie = rcu_dereference(a->act_cookie);
+   if (cookie) {
+   if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+   rcu_read_unlock();
goto nla_put_failure;
+   }
}
+   rcu_read_unlock();
 
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
if (err < 0)
goto err_mod;
 
-   if (name == NULL && tb[TCA_ACT_COOKIE]) {
-   if (a->act_cookie) {
-   kfree(a->act_cookie->data);
-   kfree(a->act_cookie);
-   }
-   a->act_cookie = cookie;
-   }
+   if (!name && tb[TCA_ACT_COOKIE])
+   tcf_set_action_cookie(>act_cookie, cookie);
 
/* module count goes up only when brand new policy is created
 * if it exists and is only bound to in a_o->init() then
-- 
2.7.5



[PATCH net-next v6 06/11] net: sched: add 'delete' function to action ops

2018-07-05 Thread Vlad Buslov
Extend action ops with 'delete' function. Each action type implements
its own delete function that doesn't depend on rtnl lock.

Implement delete function that is required to delete actions without
holding rtnl lock. Use action API function that atomically deletes action
only if it is still in action idr. This implementation prevents concurrent
threads from deleting same action twice.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Merge action ops delete definition and implementation.

 include/net/act_api.h  |  1 +
 net/sched/act_bpf.c|  8 
 net/sched/act_connmark.c   |  8 
 net/sched/act_csum.c   |  8 
 net/sched/act_gact.c   |  8 
 net/sched/act_ife.c|  8 
 net/sched/act_ipt.c| 16 
 net/sched/act_mirred.c |  8 
 net/sched/act_nat.c|  8 
 net/sched/act_pedit.c  |  8 
 net/sched/act_police.c |  8 
 net/sched/act_sample.c |  8 
 net/sched/act_simple.c |  8 
 net/sched/act_skbedit.c|  8 
 net/sched/act_skbmod.c |  8 
 net/sched/act_tunnel_key.c |  8 
 net/sched/act_vlan.c   |  8 
 17 files changed, 137 insertions(+)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a8eaae67c264..b9ed2b8256a5 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -101,6 +101,7 @@ struct tc_action_ops {
void(*stats_update)(struct tc_action *, u64, u32, u64);
size_t  (*get_fill_size)(const struct tc_action *act);
struct net_device *(*get_dev)(const struct tc_action *a);
+   int (*delete)(struct net *net, u32 index);
 };
 
 struct tc_action_net {
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 8ebf40a3506c..7941dd66ff83 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -388,6 +388,13 @@ static int tcf_bpf_search(struct net *net, struct 
tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
 }
 
+static int tcf_bpf_delete(struct net *net, u32 index)
+{
+   struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+   return tcf_idr_delete_index(tn, index);
+}
+
 static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind   =   "bpf",
.type   =   TCA_ACT_BPF,
@@ -398,6 +405,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
.init   =   tcf_bpf_init,
.walk   =   tcf_bpf_walker,
.lookup =   tcf_bpf_search,
+   .delete =   tcf_bpf_delete,
.size   =   sizeof(struct tcf_bpf),
 };
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e3787aa0025a..143c2d3de723 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -193,6 +193,13 @@ static int tcf_connmark_search(struct net *net, struct 
tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
 }
 
+static int tcf_connmark_delete(struct net *net, u32 index)
+{
+   struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+   return tcf_idr_delete_index(tn, index);
+}
+
 static struct tc_action_ops act_connmark_ops = {
.kind   =   "connmark",
.type   =   TCA_ACT_CONNMARK,
@@ -202,6 +209,7 @@ static struct tc_action_ops act_connmark_ops = {
.init   =   tcf_connmark_init,
.walk   =   tcf_connmark_walker,
.lookup =   tcf_connmark_search,
+   .delete =   tcf_connmark_delete,
.size   =   sizeof(struct tcf_connmark_info),
 };
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 334261943f9f..3768539340e0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -654,6 +654,13 @@ static size_t tcf_csum_get_fill_size(const struct 
tc_action *act)
return nla_total_size(sizeof(struct tc_csum));
 }
 
+static int tcf_csum_delete(struct net *net, u32 index)
+{
+   struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+   return tcf_idr_delete_index(tn, index);
+}
+
 static struct tc_action_ops act_csum_ops = {
.kind   = "csum",
.type   = TCA_ACT_CSUM,
@@ -665,6 +672,7 @@ static struct tc_action_ops act_csum_ops = {
.walk   = tcf_csum_walker,
.lookup = tcf_csum_search,
.get_fill_size  = tcf_csum_get_fill_size,
+   .delete = tcf_csum_delete,
.size   = sizeof(struct tcf_csum),
 };
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b4dfb2b4addc..a431a711f0dd 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -231,6 +231,13 @@ static size_t tcf_gact_get_fill_size(const struct 
tc_action *act)
return sz;
 }
 
+static int tcf_gact_delete(struct net *net, u32 index)
+{
+   struct 

[PATCH net-next v6 00/11] Modify action API for implementing lockless actions

2018-07-05 Thread Vlad Buslov
Currently, all netlink protocol handlers for updating rules, actions and
qdiscs are protected with single global rtnl lock which removes any
possibility for parallelism. This patch set is a first step to remove
rtnl lock dependency from TC rules update path.

Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added.
Handlers registered with this flag are called without RTNL taken. End
goal is to have rule update handlers(RTM_NEWTFILTER, RTM_DELTFILTER,
etc.) to be registered with UNLOCKED flag to allow parallel execution.
However, there is no intention to completely remove or split rtnl lock
itself. This patch set addresses specific problems in action API that
prevents it from being executed concurrently. This patch set does not
completely unlock rules or actions update path. Additional patch sets
are required to refactor individual actions and filters update for
parallel execution.

As a preparation for executing TC rules update handlers without rtnl
lock, action API code was audited to determine areas that assume
external synchronization with rtnl lock and must be changed to allow
safe concurrent access with following results:

1. Action idr is already protected with spinlock. However, some code
   paths assume that idr state is not changed between several
   consecutive tcf_idr_* function calls.
2. tc_action reference and bind counters are implemented as plain
   integers. Their purpose was to allow single actions to be shared
   between multiple filters, not to provide means for concurrent
   modification.
3. tc_action 'cookie' pointer field is not protected against
   modification.
4. Action API functions, that work with set of actions, use intrusive
   linked list, which cannot be used concurrently without additional
   synchronization.
5. Action API functions don't take reference to actions while using
   them, assuming external synchronization with rtnl lock.

Following solutions to these problems are implemented:

1. To remove assumption that idr state doesn't change between tcf_idr_*
   calls, implement new functions that atomically perform several
   operations on idr without releasing idr spinlock. (function to
   atomically lookup and delete action by index, function to atomically
   check if action exists and allocate new one if necessary, etc.)
2. Use atomic operations on counters to make them suitable for
   concurrent get/put operations.
3. Data that 'cookie' points to is never modified, so it is enough to
   refactor it to rcu pointer to prevent concurrent de-allocation.
4. Action API doesn't actually use any linked list specific operations
   on actions intrusive linked list, so it can be refactored to array in
   straightforward manner.
5. Always take reference to action while accessing it in action API.
   tcf_idr_search function modified to take reference to action before
   returning it, so there is no way to lookup an action without
   incrementing its reference counter. All users of this function are
   modified to release the reference after they are done using the action. With
   all users using reference counting, it is now safe to concurrently
   delete actions.

Additionally, actions init function signature was expanded with
'rtnl_held' argument, that allows actions that have internal dependency
on rtnl lock to take/release it when necessary.

Since the only shared state in the action API module is the actions themselves
and the action idr, these changes are sufficient to not rely on the global rtnl
lock for protection of internal action API data structures.

Changes from V5 to V6:
- Rebase on current net-next
- When action is deleted, set pointer in actions array to NULL to
  prevent double freeing.

Changes from V4 to V5:
- Change action delete API to track actions that were deleted, to
  prevent releasing them on error.

Changes from V3 to V4:
- Expand cover letter.
- Reduce actions array size in tcf_action_init_1.
- Rebase on latest net-next.

Changes from V2 to V3:
- Re-send with changelog copied to individual patches.

Changes from V1 to V2:
- Removed redundant actions ops lookup during delete.
- Merge action ops delete definition and implementation.
- Assume all actions have delete implemented and don't check for it
  explicitly.
- Resplit action lookup/release code to prevent memory leaks in
  individual patches.
- Make __tcf_idr_check function static
- Remove unique idr insertion function. Change original idr insert to do
  the same thing.
- Merge changes that take reference to action when performing lookup and
  changes that account for this additional reference when dumping action
  to user space into single patch.
- Change convoluted commit message.
- Rename "unlocked" to "rtnl_held" for clarity.
- Remove estimator lock add patch.
- Refactor action check-alloc code into standalone function.
- Rename tcf_idr_find_delete to tcf_idr_delete_index.
- Rearrange variable definitions in tc_action_delete.
- Add patch that refactors action API code to use array of pointers to
  actions 

[PATCH net-next v6 07/11] net: sched: implement reference counted action release

2018-07-05 Thread Vlad Buslov
Implement helper delete function that uses new action ops 'delete', instead
of destroying action directly. This is required so act API could delete
actions by index, without holding any references to action that is being
deleted.

Implement function __tcf_action_put() that releases reference to action and
frees it, if necessary. Refactor action deletion code to use new put
function and not to rely on rtnl lock. Remove rtnl lock assertions that are
no longer needed.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Removed redundant actions ops lookup during delete.
- Assume all actions have delete implemented and don't check for it
  explicitly.
- Rearrange variable definitions in tcf_action_delete.

 net/sched/act_api.c | 84 +++--
 net/sched/cls_api.c |  1 -
 2 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 0f31f09946ab..a023873db713 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -90,21 +90,39 @@ static void free_tcf(struct tc_action *p)
kfree(p);
 }
 
-static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
+static void tcf_action_cleanup(struct tc_action *p)
 {
-   spin_lock(>lock);
-   idr_remove(>action_idr, p->tcfa_index);
-   spin_unlock(>lock);
+   if (p->ops->cleanup)
+   p->ops->cleanup(p);
+
gen_kill_estimator(>tcfa_rate_est);
free_tcf(p);
 }
 
+static int __tcf_action_put(struct tc_action *p, bool bind)
+{
+   struct tcf_idrinfo *idrinfo = p->idrinfo;
+
+   if (refcount_dec_and_lock(>tcfa_refcnt, >lock)) {
+   if (bind)
+   atomic_dec(>tcfa_bindcnt);
+   idr_remove(>action_idr, p->tcfa_index);
+   spin_unlock(>lock);
+
+   tcf_action_cleanup(p);
+   return 1;
+   }
+
+   if (bind)
+   atomic_dec(>tcfa_bindcnt);
+
+   return 0;
+}
+
 int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
 {
int ret = 0;
 
-   ASSERT_RTNL();
-
/* Release with strict==1 and bind==0 is only called through act API
 * interface (classifiers always bind). Only case when action with
 * positive reference count and zero bind count can exist is when it was
@@ -118,18 +136,11 @@ int __tcf_idr_release(struct tc_action *p, bool bind, 
bool strict)
 * are acceptable.
 */
if (p) {
-   if (bind)
-   atomic_dec(>tcfa_bindcnt);
-   else if (strict && atomic_read(>tcfa_bindcnt) > 0)
+   if (!bind && strict && atomic_read(>tcfa_bindcnt) > 0)
return -EPERM;
 
-   if (atomic_read(>tcfa_bindcnt) <= 0 &&
-   refcount_dec_and_test(>tcfa_refcnt)) {
-   if (p->ops->cleanup)
-   p->ops->cleanup(p);
-   tcf_idr_remove(p->idrinfo, p);
+   if (__tcf_action_put(p, bind))
ret = ACT_P_DELETED;
-   }
}
 
return ret;
@@ -340,11 +351,7 @@ int tcf_idr_delete_index(struct tc_action_net *tn, u32 
index)
p->tcfa_index));
spin_unlock(>lock);
 
-   if (p->ops->cleanup)
-   p->ops->cleanup(p);
-
-   gen_kill_estimator(>tcfa_rate_est);
-   free_tcf(p);
+   tcf_action_cleanup(p);
module_put(owner);
return 0;
}
@@ -615,6 +622,11 @@ int tcf_action_destroy(struct list_head *actions, int bind)
return ret;
 }
 
+static int tcf_action_put(struct tc_action *p)
+{
+   return __tcf_action_put(p, false);
+}
+
 int
 tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int 
ref)
 {
@@ -1092,6 +1104,35 @@ static int tca_action_flush(struct net *net, struct 
nlattr *nla,
return err;
 }
 
+static int tcf_action_delete(struct net *net, struct list_head *actions,
+struct netlink_ext_ack *extack)
+{
+   struct tc_action *a, *tmp;
+   u32 act_index;
+   int ret;
+
+   list_for_each_entry_safe(a, tmp, actions, list) {
+   const struct tc_action_ops *ops = a->ops;
+
+   /* Actions can be deleted concurrently so we must save their
+* type and id to search again after reference is released.
+*/
+   act_index = a->tcfa_index;
+
+   list_del(>list);
+   if (tcf_action_put(a)) {
+   /* last reference, action was deleted concurrently */
+   module_put(ops->owner);
+   } else  {
+   /* now do the delete */
+   

[PATCH net-next v6 11/11] net: sched: change action API to use array of pointers to actions

2018-07-05 Thread Vlad Buslov
Act API used linked list to pass set of actions to functions. It is
intrusive data structure that stores list nodes inside action structure
itself, which means it is not safe to modify such list concurrently.
However, action API doesn't use any linked list specific operations on this
set of actions, so it can be safely refactored into plain pointer array.

Refactor action API to use array of pointers to tc_actions instead of
linked list. Change argument 'actions' type of exported action init,
destroy and dump functions.

Acked-by: Jiri Pirko 
Signed-off-by: Vlad Buslov 
---
Changes from V5 to V6:
- When action is deleted, set pointer in actions array to NULL to
  prevent double freeing.

Changes from V4 to V5:
- Change action delete API to track actions that were deleted, to
  prevent releasing them on error.

Changes from V3 to V4:
- Reduce actions array size in tcf_action_init_1.

 include/net/act_api.h |  7 ++--
 net/sched/act_api.c   | 89 +--
 net/sched/cls_api.c   | 21 
 3 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8090de2edab7..683ce41053d9 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -168,19 +168,20 @@ static inline int tcf_idr_release(struct tc_action *a, 
bool bind)
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations 
*ops);
 int tcf_unregister_action(struct tc_action_ops *a,
  struct pernet_operations *ops);
-int tcf_action_destroy(struct list_head *actions, int bind);
+int tcf_action_destroy(struct tc_action *actions[], int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions, size_t *attr_size,
+   struct tc_action *actions[], size_t *attr_size,
bool rtnl_held, struct netlink_ext_ack *extack);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
bool rtnl_held,
struct netlink_ext_ack *extack);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
+int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
+   int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9511502e1cbb..bf1c35f3deb6 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -657,13 +657,15 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
 }
 EXPORT_SYMBOL(tcf_action_exec);
 
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action *actions[], int bind)
 {
const struct tc_action_ops *ops;
-   struct tc_action *a, *tmp;
-   int ret = 0;
+   struct tc_action *a;
+   int ret = 0, i;
 
-   list_for_each_entry_safe(a, tmp, actions, list) {
+   for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+   a = actions[i];
+   actions[i] = NULL;
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -679,11 +681,12 @@ static int tcf_action_put(struct tc_action *p)
return __tcf_action_put(p, false);
 }
 
-static void tcf_action_put_lst(struct list_head *actions)
+static void tcf_action_put_many(struct tc_action *actions[])
 {
-   struct tc_action *a, *tmp;
+   int i;
 
-   list_for_each_entry_safe(a, tmp, actions, list) {
+   for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+   struct tc_action *a = actions[i];
const struct tc_action_ops *ops = a->ops;
 
if (tcf_action_put(a))
@@ -735,14 +738,15 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action 
*a, int bind, int ref)
 }
 EXPORT_SYMBOL(tcf_action_dump_1);
 
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
int bind, int ref)
 {
struct tc_action *a;
-   int err = -EINVAL;
+   int err = -EINVAL, i;
struct nlattr *nest;
 
-   list_for_each_entry(a, actions, list) {
+   for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+   a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest 

[PATCH net-next v6 08/11] net: sched: don't release reference on action overwrite

2018-07-05 Thread Vlad Buslov
Return from action init function with reference to action taken,
even when overwriting existing action.

Action init API initializes its fourth argument (pointer to pointer to tc
action) to either existing action with same index or newly created action.
In case of an existing index (and bind argument is zero), init function returns
without incrementing action reference counter. Caller of action init then
proceeds working with action, without actually holding reference to it.
This means that action could be deleted concurrently.

Change action init behavior to always take reference to action before
returning successfully, in order to protect from concurrent deletion.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Resplit action lookup/release code to prevent memory leaks in
  individual patches.
- Change convoluted commit message.

 net/sched/act_api.c|  2 --
 net/sched/act_bpf.c|  8 
 net/sched/act_connmark.c   |  5 +++--
 net/sched/act_csum.c   |  8 
 net/sched/act_gact.c   |  5 +++--
 net/sched/act_ife.c| 10 +-
 net/sched/act_ipt.c|  5 +++--
 net/sched/act_mirred.c |  5 ++---
 net/sched/act_nat.c|  5 +++--
 net/sched/act_pedit.c  |  2 +-
 net/sched/act_police.c |  8 +++-
 net/sched/act_sample.c |  8 +++-
 net/sched/act_simple.c |  5 +++--
 net/sched/act_skbedit.c|  5 +++--
 net/sched/act_skbmod.c |  8 +++-
 net/sched/act_tunnel_key.c | 11 ---
 net/sched/act_vlan.c   |  8 +++-
 17 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a023873db713..f019f0464cec 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -870,8 +870,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
}
act->order = i;
sz += tcf_action_fill_size(act);
-   if (ovr)
-   refcount_inc(>tcfa_refcnt);
list_add_tail(>list, actions);
}
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 7941dd66ff83..d3f4ac6f2c4b 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr 
*nla,
if (bind)
return 0;
 
-   tcf_idr_release(*act, bind);
-   if (!replace)
+   if (!replace) {
+   tcf_idr_release(*act, bind);
return -EEXIST;
+   }
}
 
is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
@@ -356,8 +357,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
return res;
 out:
-   if (res == ACT_P_CREATED)
-   tcf_idr_release(*act, bind);
+   tcf_idr_release(*act, bind);
 
return ret;
 }
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 143c2d3de723..701e90244eff 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -135,9 +135,10 @@ static int tcf_connmark_init(struct net *net, struct 
nlattr *nla,
ci = to_connmark(*a);
if (bind)
return 0;
-   tcf_idr_release(*a, bind);
-   if (!ovr)
+   if (!ovr) {
+   tcf_idr_release(*a, bind);
return -EEXIST;
+   }
/* replacing action and zone */
ci->tcf_action = parm->action;
ci->zone = parm->zone;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 3768539340e0..5dbee136b0a1 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -76,9 +76,10 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
} else {
if (bind)/* dont override defaults */
return 0;
-   tcf_idr_release(*a, bind);
-   if (!ovr)
+   if (!ovr) {
+   tcf_idr_release(*a, bind);
return -EEXIST;
+   }
}
 
p = to_tcf_csum(*a);
@@ -86,8 +87,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
-   if (ret == ACT_P_CREATED)
-   tcf_idr_release(*a, bind);
+   tcf_idr_release(*a, bind);
return -ENOMEM;
}
params_old = rtnl_dereference(p->params);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index a431a711f0dd..11c4de3f344e 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -100,9 +100,10 @@ static int tcf_gact_init(struct net *net, struct nlattr 
*nla,
} else {
if (bind)/* dont override defaults */
return 

[PATCH net-next v6 04/11] net: sched: always take reference to action

2018-07-05 Thread Vlad Buslov
Without rtnl lock protection it is no longer safe to use pointer to tc
action without holding reference to it. (it can be destroyed concurrently)

Remove unsafe action idr lookup function. Instead of it, implement safe tcf
idr check function that atomically looks up action in idr and increments
its reference and bind counters. Implement both action search and check
using the new safe function.

Reference taken by idr check is temporary and should not be accounted by
userspace clients (both logically and to preserve current API behavior).
Subtract temporary reference when dumping action to userspace using existing
tca_get_fill function arguments.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Make __tcf_idr_check function static
- Merge changes that take reference to action when performing lookup and
  changes that account for this additional reference when dumping action
  to user space into single patch.

 net/sched/act_api.c | 46 --
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 256b0c93916c..aa304d36fee0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct 
sk_buff *skb,
 }
 EXPORT_SYMBOL(tcf_generic_walker);
 
-static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
+static bool __tcf_idr_check(struct tc_action_net *tn, u32 index,
+   struct tc_action **a, int bind)
 {
-   struct tc_action *p = NULL;
+   struct tcf_idrinfo *idrinfo = tn->idrinfo;
+   struct tc_action *p;
 
spin_lock(>lock);
p = idr_find(>action_idr, index);
+   if (p) {
+   refcount_inc(>tcfa_refcnt);
+   if (bind)
+   atomic_inc(>tcfa_bindcnt);
+   }
spin_unlock(>lock);
 
-   return p;
+   if (p) {
+   *a = p;
+   return true;
+   }
+   return false;
 }
 
 int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
 {
-   struct tcf_idrinfo *idrinfo = tn->idrinfo;
-   struct tc_action *p = tcf_idr_lookup(index, idrinfo);
-
-   if (p) {
-   *a = p;
-   return 1;
-   }
-   return 0;
+   return __tcf_idr_check(tn, index, a, 0);
 }
 EXPORT_SYMBOL(tcf_idr_search);
 
 bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
   int bind)
 {
-   struct tcf_idrinfo *idrinfo = tn->idrinfo;
-   struct tc_action *p = tcf_idr_lookup(index, idrinfo);
-
-   if (index && p) {
-   if (bind)
-   atomic_inc(>tcfa_bindcnt);
-   refcount_inc(>tcfa_refcnt);
-   *a = p;
-   return true;
-   }
-   return false;
+   return __tcf_idr_check(tn, index, a, bind);
 }
 EXPORT_SYMBOL(tcf_idr_check);
 
@@ -932,7 +926,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr 
*n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
-0, 0) <= 0) {
+0, 1) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while 
adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -1072,7 +1066,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, 
struct list_head *actions,
return -ENOBUFS;
 
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
-0, 1) <= 0) {
+0, 2) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action 
attributes");
kfree_skb(skb);
return -EINVAL;
@@ -1131,14 +1125,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, 
struct nlmsghdr *n,
if (event == RTM_GETACTION)
ret = tcf_get_notify(net, portid, n, , event, extack);
else { /* delete */
+   cleanup_a(, 1); /* lookup took reference */
ret = tcf_del_notify(net, n, , portid, attr_size, 
extack);
if (ret)
goto err;
return ret;
}
 err:
-   if (event != RTM_GETACTION)
-   tcf_action_destroy(, 0);
+   tcf_action_destroy(, 0);
return ret;
 }
 
-- 
2.7.5



[PATCH net-next v6 09/11] net: sched: use reference counting action init

2018-07-05 Thread Vlad Buslov
Change action API to assume that action init function always takes
reference to action, even when overwriting existing action. This is
necessary because action API continues to use action pointer after init
function is done. At this point action becomes accessible for concurrent
modifications, so user must always hold reference to it.

Implement helper put list function to atomically release list of actions
after action API init code is done using them.

Reviewed-by: Marcelo Ricardo Leitner 
Signed-off-by: Vlad Buslov 
Signed-off-by: Jiri Pirko 
---
Changes from V1 to V2:
- Resplit action lookup/release code to prevent memory leaks in
  individual patches.

 net/sched/act_api.c | 35 +--
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f019f0464cec..eefe8c2fe667 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -627,6 +627,18 @@ static int tcf_action_put(struct tc_action *p)
return __tcf_action_put(p, false);
 }
 
+static void tcf_action_put_lst(struct list_head *actions)
+{
+   struct tc_action *a, *tmp;
+
+   list_for_each_entry_safe(a, tmp, actions, list) {
+   const struct tc_action_ops *ops = a->ops;
+
+   if (tcf_action_put(a))
+   module_put(ops->owner);
+   }
+}
+
 int
 tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int 
ref)
 {
@@ -835,17 +847,6 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions, int ovr)
-{
-   struct tc_action *a;
-
-   if (!ovr)
-   return;
-
-   list_for_each_entry(a, actions, list)
-   refcount_dec(>tcfa_refcnt);
-}
-
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
struct list_head *actions, size_t *attr_size,
@@ -874,11 +875,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
}
 
*attr_size = tcf_action_full_attrs_size(sz);
-
-   /* Remove the temp refcnt which was necessary to protect against
-* destroying an existing action which was being replaced
-*/
-   cleanup_a(actions, ovr);
return 0;
 
 err:
@@ -1209,7 +1205,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct 
nlmsghdr *n,
return ret;
}
 err:
-   tcf_action_destroy(, 0);
+   tcf_action_put_lst();
return ret;
 }
 
@@ -1251,8 +1247,11 @@ static int tcf_action_add(struct net *net, struct nlattr 
*nla,
  _size, true, extack);
if (ret)
return ret;
+   ret = tcf_add_notify(net, n, , portid, attr_size, extack);
+   if (ovr)
+   tcf_action_put_lst();
 
-   return tcf_add_notify(net, n, , portid, attr_size, extack);
+   return ret;
 }
 
 static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
-- 
2.7.5



Re: [PATCH iproute2] man: Fix typos on tc-cbs

2018-07-05 Thread Simon Horman
Some changelog text should go here.

On Wed, Jun 27, 2018 at 10:50:51AM -0700, Jesus Sanchez-Palencia wrote:
> Signed-off-by: Jesus Sanchez-Palencia 

Otherwise, this seems fine to me.

Reviewed-by: Simon Horman 

> ---
>  man/man8/tc-cbs.8 | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/man/man8/tc-cbs.8 b/man/man8/tc-cbs.8
> index 32e1e0d4..ad1d8821 100644
> --- a/man/man8/tc-cbs.8
> +++ b/man/man8/tc-cbs.8
> @@ -28,7 +28,7 @@ defined rate limiting method to the traffic.
>  This queueing discipline is intended to be used by TSN (Time Sensitive
>  Networking) applications, the CBS parameters are derived directly by
>  what is described by the Annex L of the IEEE 802.1Q-2014
> -Sepcification. The algorithm and how it affects the latency are
> +Specification. The algorithm and how it affects the latency are
>  detailed there.
>  
>  CBS is meant to be installed under another qdisc that maps packet
> @@ -60,7 +60,7 @@ packet size, which is then used for calculating the 
> idleslope.
>  sendslope
>  Sendslope is the rate of credits that is depleted (it should be a
>  negative number of kilobits per second) when a transmission is
> -ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
> +occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
>  8.6.8.2 item g):
>  
>  sendslope = idleslope - port_transmit_rate
> -- 
> 2.17.1
> 


Re: [PATCHv2 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-05 Thread Xin Long
On Thu, Jul 5, 2018 at 9:18 PM, David Ahern  wrote:
> On 7/5/18 1:57 AM, Xin Long wrote:
>> On Thu, Jul 5, 2018 at 2:36 AM, David Ahern  wrote:
>>> On 7/4/18 11:56 AM, Xin Long wrote:
>>>
>>>>> your commands are not a proper test. The test should succeed and fail
>>>>> based on the routing lookup, not iptables rules.
>>>> A proper test can be done easily with netns, as vrf can't isolate much.
>>>> I don't want to bother forwarding/ directory with netns, so I will probably
>>>> just drop this selftest, and let the feature patch go first.

>>>
>>> BTW, VRF isolates at the routing layer and this is a routing change. We
>>> need to understand why it does not work with VRF. Perhaps another tweak
>>> is needed for VRF.
>> One problem was that the peer may not use the address on the dev
>> that echo_request comes from as the src IP of echo_reply when the
>> echo_request's dst IP is broadcast, but try to get another one by
>> looking up a route without ".flowi4_oif" set. See:
>>
>> icmp_reply()->fib_compute_spec_dst():
>> struct flowi4 fl4 = {
>> .flowi4_iif = LOOPBACK_IFINDEX,
>> .daddr = ip_hdr(skb)->saddr,
>> .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
>> .flowi4_scope = scope,
>> .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark 
>> : 0,
>> };
>> if (!fib_lookup(net, , , 0))
>> return FIB_RES_PREFSRC(net, res);
>>
>>
>> Without ".flowi4_oif" set, it won't match the vrf route. That's why
>> I had to make h2 NOT into a vrf so that h1 can get the echo_reply.
>> But it can't tell if this echo_reply is from h2 or r1, as r1's echo_reply
>> will also use the same src IP which is actually got from main route
>> space as  ".flowi4_oif" is not set.
>> (I hope this description is clear to you) :)
>>
>> So i'm not sure if we can do any tweak for VRF.
>>
>
> Try this:
>
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index b21833651394..e46cdd310e5f 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
> if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
> struct flowi4 fl4 = {
> .flowi4_iif = LOOPBACK_IFINDEX,
> +   .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
> .daddr = ip_hdr(skb)->saddr,
> .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
> .flowi4_scope = scope,
Great, with your fix I can extend this selftest further,
but I hope it won't cause any side effects.

Thank you.


[PATCH net] net: aquantia: vlan unicast address list correct handling

2018-07-05 Thread Igor Russkikh
Setting up macvlan/macvtap networks over an atlantic NIC results
in no traffic over these networks because ndo_set_rx_mode did
not list UC MACs as registered in the unicast filter.

Here we fix that, taking into account the maximum number of UC
filters supported by the hardware. If more than MAX addresses are
registered, we just enable promisc and/or allmulti to pass
the traffic in.

We also remove the MULTICAST_ADDRESS_MAX constant from aq_cfg since
that's not a configurable parameter at all.

Fixes: b21f502 ("net:ethernet:aquantia: Fix for multicast filter handling.")
Signed-off-by: Igor Russkikh 
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h|  2 -
 drivers/net/ethernet/aquantia/atlantic/aq_hw.h |  4 +-
 drivers/net/ethernet/aquantia/atlantic/aq_main.c   | 11 +
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c| 47 +-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h|  2 +-
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c  |  2 +-
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c  |  4 +-
 7 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h 
b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index fc73831..91eb891 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -63,8 +63,6 @@
 
 #define AQ_CFG_NAPI_WEIGHT 64U
 
-#define AQ_CFG_MULTICAST_ADDRESS_MAX 32U
-
 /*#define AQ_CFG_MAC_ADDR_PERMANENT {0x30, 0x0E, 0xE3, 0x12, 0x34, 0x56}*/
 
 #define AQ_NIC_FC_OFF0U
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h 
b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
index a2d416b..2c6ebd9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
@@ -98,6 +98,8 @@ struct aq_stats_s {
 #define AQ_HW_MEDIA_TYPE_TP1U
 #define AQ_HW_MEDIA_TYPE_FIBRE 2U
 
+#define AQ_HW_MULTICAST_ADDRESS_MAX 32U
+
 struct aq_hw_s {
atomic_t flags;
u8 rbl_enabled:1;
@@ -177,7 +179,7 @@ struct aq_hw_ops {
unsigned int packet_filter);
 
int (*hw_multicast_list_set)(struct aq_hw_s *self,
-u8 ar_mac[AQ_CFG_MULTICAST_ADDRESS_MAX]
+u8 ar_mac[AQ_HW_MULTICAST_ADDRESS_MAX]
 [ETH_ALEN],
 u32 count);
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c 
b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
index ba5fe8c..e3ae29e 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
@@ -135,17 +135,10 @@ static int aq_ndev_set_mac_address(struct net_device 
*ndev, void *addr)
 static void aq_ndev_set_multicast_settings(struct net_device *ndev)
 {
struct aq_nic_s *aq_nic = netdev_priv(ndev);
-   int err = 0;
 
-   err = aq_nic_set_packet_filter(aq_nic, ndev->flags);
-   if (err < 0)
-   return;
+   aq_nic_set_packet_filter(aq_nic, ndev->flags);
 
-   if (netdev_mc_count(ndev)) {
-   err = aq_nic_set_multicast_list(aq_nic, ndev);
-   if (err < 0)
-   return;
-   }
+   aq_nic_set_multicast_list(aq_nic, ndev);
 }
 
 static const struct net_device_ops aq_ndev_ops = {
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c 
b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 1a1a638..7a22d02 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -563,34 +563,41 @@ int aq_nic_set_packet_filter(struct aq_nic_s *self, 
unsigned int flags)
 
 int aq_nic_set_multicast_list(struct aq_nic_s *self, struct net_device *ndev)
 {
+   unsigned int packet_filter = self->packet_filter;
struct netdev_hw_addr *ha = NULL;
unsigned int i = 0U;
 
-   self->mc_list.count = 0U;
-
-   netdev_for_each_mc_addr(ha, ndev) {
-   ether_addr_copy(self->mc_list.ar[i++], ha->addr);
-   ++self->mc_list.count;
+   self->mc_list.count = 0;
+   if (netdev_uc_count(ndev) > AQ_HW_MULTICAST_ADDRESS_MAX) {
+   packet_filter |= IFF_PROMISC;
+   } else {
+   netdev_for_each_uc_addr(ha, ndev) {
+   ether_addr_copy(self->mc_list.ar[i++], ha->addr);
 
-   if (i >= AQ_CFG_MULTICAST_ADDRESS_MAX)
-   break;
+   if (i >= AQ_HW_MULTICAST_ADDRESS_MAX)
+   break;
+   }
}
 
-   if (i >= AQ_CFG_MULTICAST_ADDRESS_MAX) {
-   /* Number of filters is too big: atlantic does not support this.
-* Force all multi filter to support this.
-* With this we disable all UC filters and setup "all pass"
-* multicast mask
-*/
-   self->packet_filter |= 

Re: [PATCHv2 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-05 Thread David Ahern
On 7/5/18 1:57 AM, Xin Long wrote:
> On Thu, Jul 5, 2018 at 2:36 AM, David Ahern  wrote:
>> On 7/4/18 11:56 AM, Xin Long wrote:
>>
>>>> your commands are not a proper test. The test should succeed and fail
>>>> based on the routing lookup, not iptables rules.
>>> A proper test can be done easily with netns, as vrf can't isolate much.
>>> I don't want to bother forwarding/ directory with netns, so I will probably
>>> just drop this selftest, and let the feature patch go first.
>>>
>>
>> BTW, VRF isolates at the routing layer and this is a routing change. We
>> need to understand why it does not work with VRF. Perhaps another tweak
>> is needed for VRF.
> One problem was that the peer may not use the address on the dev
> that echo_request comes from as the src IP of echo_reply when the
> echo_request's dst IP is broadcast, but try to get another one by
> looking up a route without ".flowi4_oif" set. See:
> 
> icmp_reply()->fib_compute_spec_dst():
> struct flowi4 fl4 = {
> .flowi4_iif = LOOPBACK_IFINDEX,
> .daddr = ip_hdr(skb)->saddr,
> .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
> .flowi4_scope = scope,
> .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 
> 0,
> };
> if (!fib_lookup(net, , , 0))
> return FIB_RES_PREFSRC(net, res);
> 
> 
> Without ".flowi4_oif" set, it won't match the vrf route. That's why
> I had to make h2 NOT into a vrf so that h1 can get the echo_reply.
> But it can't tell if this echo_reply is from h2 or r1, as r1's echo_reply
> will also use the same src IP which is actually got from main route
> space as  ".flowi4_oif" is not set.
> (I hope this description is clear to you) :)
> 
> So i'm not sure if we can do any tweak for VRF.
> 

Try this:

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..e46cdd310e5f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
+   .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
.flowi4_scope = scope,


[PATCH net] MAINTAINERS: update my email address

2018-07-05 Thread Stefan Schmidt
The mail server hosting the old address is going to fade out.
Time to update to an address I control directly.

Signed-off-by: Stefan Schmidt 
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b40d702337f2..ca3e75ec9308 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2523,7 +2523,7 @@ S:Supported
 F: drivers/scsi/esas2r
 
 ATUSB IEEE 802.15.4 RADIO DRIVER
-M: Stefan Schmidt 
+M: Stefan Schmidt 
 L: linux-w...@vger.kernel.org
 S: Maintained
 F: drivers/net/ieee802154/atusb.c
@@ -6908,7 +6908,7 @@ F:drivers/clk/clk-versaclock5.c
 
 IEEE 802.15.4 SUBSYSTEM
 M: Alexander Aring 
-M: Stefan Schmidt 
+M: Stefan Schmidt 
 L: linux-w...@vger.kernel.org
 W: http://wpan.cakelab.org/
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git
-- 
2.14.4



Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc

2018-07-05 Thread David Miller
From: David Woodhouse 
Date: Thu, 05 Jul 2018 09:44:22 +0100

> On Sat, 2018-06-16 at 16:27 -0700, David Miller wrote:
>> From: "David Woodhouse" 
>> Date: Sat, 16 Jun 2018 20:52:33 -
>> 
>> >> This Fixes tag shoots the messenger really.
>> >>
>> >> I suggest to instead use :
>> >>
>> >> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
>> > 
>> > 
>> > Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)
>> 
>> Patch applied with adjusted Fixes: tag, and queued up for -stable.
> 
> Thanks. Gentle prod about the "stable" part of that: OpenWRT is
> lining up for a release, and it'd be good to ingest the patch properly if
> possible.
> 
> I periodically whine at them about the number of outstanding patches
> not in upstream. It helps if one of them doesn't have my name on :)

It's in my next batch of -stable submissions. :)


Re: [PATCH wpan 1/2] net: 6lowpan: fix reserved space for single frames

2018-07-05 Thread Stefan Schmidt
Hello.

[CC David Palma and Rabi Narayan Sahoo]

On 02.07.2018 22:32, Alexander Aring wrote:
> This patch fixes the handling of tailroom and headroom for
> single 6lowpan frames. We need to be sure we have a skb with the right
> headroom and tailroom for single frames. This patch does it by using
> skb_copy_expand() if the upper layer did not allocate enough headroom
> and tailroom.
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195059
> Reported-by: David Palma 
> Reported-by: Rabi Narayan Sahoo 
> Signed-off-by: Alexander Aring 

David, Rabi, could you please test these two patches and verify that they
fix the problems you have?

regards
Stefan Schmidt

> ---
>  net/ieee802154/6lowpan/tx.c | 21 ++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
> index e6ff5128e61a..d0c4d220de08 100644
> --- a/net/ieee802154/6lowpan/tx.c
> +++ b/net/ieee802154/6lowpan/tx.c
> @@ -265,9 +265,24 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct 
> net_device *ldev)
>   /* We must take a copy of the skb before we modify/replace the ipv6
>* header as the header could be used elsewhere
>*/
> - skb = skb_unshare(skb, GFP_ATOMIC);
> - if (!skb)
> - return NET_XMIT_DROP;
> + if (unlikely(skb_headroom(skb) < ldev->needed_headroom ||
> +  skb_tailroom(skb) < ldev->needed_tailroom)) {
> + struct sk_buff *nskb;
> +
> + nskb = skb_copy_expand(skb, ldev->needed_headroom,
> +ldev->needed_tailroom, GFP_ATOMIC);
> + if (likely(skb)) {
> + consume_skb(skb);
> + skb = nskb;
> + } else {
> + kfree_skb(skb);
> + return NET_XMIT_DROP;
> + }
> + } else {
> + skb = skb_unshare(skb, GFP_ATOMIC);
> + if (!skb)
> + return NET_XMIT_DROP;
> + }
>  
>   ret = lowpan_header(skb, ldev, _size, _offset);
>   if (ret < 0) {
> 


Re: [PATCH net-next] cxgb4: Fix the condition to check if the card is T5

2018-07-05 Thread David Miller
From: Ganesh Goudar 
Date: Wed,  4 Jul 2018 17:49:33 +0530

> Use 'chip_ver' rather than 'chip' to check if the card
> is T5.
> 
> Fixes: e8d452923ae6 ("cxgb4: clean up init_one")
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH net] ixgbe: Off by one in ixgbe_ipsec_tx()

2018-07-05 Thread David Miller
From: Dan Carpenter 
Date: Wed, 4 Jul 2018 12:53:37 +0300

> The ipsec->tx_tbl[] has IXGBE_IPSEC_MAX_SA_COUNT elements so the > needs
> to be changed to >= so we don't read one element beyond the end of the
> array.
> 
> Fixes: 592594704761 ("ixgbe: process the Tx ipsec offload")
> Signed-off-by: Dan Carpenter 

I'll let Jeff pick this up.


Re: [PATCH net] qed: off by one in qed_parse_mcp_trace_buf()

2018-07-05 Thread David Miller
From: Dan Carpenter 
Date: Wed, 4 Jul 2018 12:52:36 +0300

> If format_idx == s_mcp_trace_meta.formats_num then we read one element
> beyond the end of the s_mcp_trace_meta.formats[] array.
> 
> Fixes: 50bc60cb155c ("qed*: Utilize FW 8.33.11.0")
> Signed-off-by: Dan Carpenter 

Applied.


Re: [PATCH net-next] net: aquantia: Make some functions static

2018-07-05 Thread David Miller
From: Wei Yongjun 
Date: Thu, 5 Jul 2018 09:00:10 +

> Fixes the following sparse warnings:
> 
> drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c:525:5: warning:
>  symbol 'hw_atl_utils_mpi_set_speed' was not declared. Should it be static?
> drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c:536:5: warning:
>  symbol 'hw_atl_utils_mpi_set_state' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun 

Applied.


Re: [PATCH net-next] net: dsa: vsc73xx: Make some functions static

2018-07-05 Thread David Miller
From: Wei Yongjun 
Date: Thu, 5 Jul 2018 08:59:09 +

> Fixes the following sparse warnings:
> 
> drivers/net/dsa/vitesse-vsc73xx.c:1054:6: warning:
>  symbol 'vsc73xx_get_strings' was not declared. Should it be static?
> drivers/net/dsa/vitesse-vsc73xx.c:1113:5: warning:
>  symbol 'vsc73xx_get_sset_count' was not declared. Should it be static?
> drivers/net/dsa/vitesse-vsc73xx.c:1122:6: warning:
>  symbol 'vsc73xx_get_ethtool_stats' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun 

Applied.


Re: [PATCH][net-next][v2] net: limit each hash list length to MAX_GRO_SKBS

2018-07-05 Thread David Miller
From: Li RongQing 
Date: Thu,  5 Jul 2018 14:34:32 +0800

> After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash
> table.") there are 8 hash buckets, which allows more flows to be held for
> merging. But MAX_GRO_SKBS, the total number of skbs held for merging, is
> still 8, which limits the hash table performance.
> 
> Keep MAX_GRO_SKBS as 8 skbs, but limit each hash list length to 8 skbs
> instead of limiting the total to 8 skbs.
> 
> Signed-off-by: Li RongQing 

Applied, thanks.


[PATCH net-next] net: aquantia: Make some functions static

2018-07-05 Thread Wei Yongjun
Fixes the following sparse warnings:

drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c:525:5: warning:
 symbol 'hw_atl_utils_mpi_set_speed' was not declared. Should it be static?
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c:536:5: warning:
 symbol 'hw_atl_utils_mpi_set_state' was not declared. Should it be static?

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c 
b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index e1feba5..c965e65 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -522,7 +522,7 @@ void hw_atl_utils_mpi_read_stats(struct aq_hw_s *self,
 err_exit:;
 }
 
-int hw_atl_utils_mpi_set_speed(struct aq_hw_s *self, u32 speed)
+static int hw_atl_utils_mpi_set_speed(struct aq_hw_s *self, u32 speed)
 {
u32 val = aq_hw_read_reg(self, HW_ATL_MPI_CONTROL_ADR);
 
@@ -533,8 +533,8 @@ int hw_atl_utils_mpi_set_speed(struct aq_hw_s *self, u32 
speed)
return 0;
 }
 
-int hw_atl_utils_mpi_set_state(struct aq_hw_s *self,
-  enum hal_atl_utils_fw_state_e state)
+static int hw_atl_utils_mpi_set_state(struct aq_hw_s *self,
+ enum hal_atl_utils_fw_state_e state)
 {
int err = 0;
u32 transaction_id = 0;



[PATCH net-next] net: dsa: vsc73xx: Make some functions static

2018-07-05 Thread Wei Yongjun
Fixes the following sparse warnings:

drivers/net/dsa/vitesse-vsc73xx.c:1054:6: warning:
 symbol 'vsc73xx_get_strings' was not declared. Should it be static?
drivers/net/dsa/vitesse-vsc73xx.c:1113:5: warning:
 symbol 'vsc73xx_get_sset_count' was not declared. Should it be static?
drivers/net/dsa/vitesse-vsc73xx.c:1122:6: warning:
 symbol 'vsc73xx_get_ethtool_stats' was not declared. Should it be static?

Signed-off-by: Wei Yongjun 
---
 drivers/net/dsa/vitesse-vsc73xx.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/vitesse-vsc73xx.c 
b/drivers/net/dsa/vitesse-vsc73xx.c
index a4fc260..d4ea5cd 100644
--- a/drivers/net/dsa/vitesse-vsc73xx.c
+++ b/drivers/net/dsa/vitesse-vsc73xx.c
@@ -1051,8 +1051,8 @@ static void vsc73xx_port_disable(struct dsa_switch *ds, 
int port,
return NULL;
 }
 
-void vsc73xx_get_strings(struct dsa_switch *ds, int port, u32 stringset,
-uint8_t *data)
+static void vsc73xx_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+   uint8_t *data)
 {
const struct vsc73xx_counter *cnt;
struct vsc73xx *vsc = ds->priv;
@@ -1110,7 +1110,7 @@ void vsc73xx_get_strings(struct dsa_switch *ds, int port, 
u32 stringset,
}
 }
 
-int vsc73xx_get_sset_count(struct dsa_switch *ds, int port, int sset)
+static int vsc73xx_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
/* We only support SS_STATS */
if (sset != ETH_SS_STATS)
@@ -1119,7 +1119,8 @@ int vsc73xx_get_sset_count(struct dsa_switch *ds, int 
port, int sset)
return 8;
 }
 
-void vsc73xx_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
+static void vsc73xx_get_ethtool_stats(struct dsa_switch *ds, int port,
+ uint64_t *data)
 {
struct vsc73xx *vsc = ds->priv;
u8 regs[] = {



Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc

2018-07-05 Thread David Woodhouse
On Sat, 2018-06-16 at 16:27 -0700, David Miller wrote:
> From: "David Woodhouse" 
> Date: Sat, 16 Jun 2018 20:52:33 -
> 
> >> This Fixes tag shoots the messenger really.
> >>
> >> I suggest to instead use :
> >>
> >> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
> > 
> > 
> > Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)
> 
> Patch applied with adjusted Fixes: tag, and queued up for -stable.

Thanks. Gentle prod about the "stable" part of that: OpenWRT is
lining up for a release, and it'd be good to ingest the patch properly if
possible.

I periodically whine at them about the number of outstanding patches
not in upstream. It helps if one of them doesn't have my name on :)

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH bpf-next 11/11] tools: bpftool: allow reuse of maps with bpftool prog load

2018-07-05 Thread Daniel Borkmann
On 07/04/2018 04:54 AM, Jakub Kicinski wrote:
> Add map parameter to prog load which will allow reuse of existing
> maps instead of creating new ones.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 
[...]
> +
> + fd = map_parse_fd(, );
> + if (fd < 0)
> + goto err_free_reuse_maps;
> +
> + map_replace = reallocarray(map_replace, old_map_fds + 1,
> +sizeof(*map_replace));
> + if (!map_replace) {
> + p_err("mem alloc failed");
> + goto err_free_reuse_maps;

Series in general looks good to me. However, above reallocarray() doesn't
exist and hence build fails, please see below. Is that from newest glibc?

You probably need some fallback implementation or in general have something
bpftool internal that doesn't make a bet on its availability.

# make

Auto-detecting system features:
...libbfd: [ on  ]
...disassembler-four-args: [ OFF ]

  CC   bpf_jit_disasm.o
  LINK bpf_jit_disasm
  CC   bpf_dbg.o
  LINK bpf_dbg
  CC   bpf_asm.o
  BISONbpf_exp.yacc.c
  CC   bpf_exp.yacc.o
  FLEX bpf_exp.lex.c
  CC   bpf_exp.lex.o
  LINK bpf_asm
  DESCEND  bpftool

Auto-detecting system features:
...libbfd: [ on  ]
...disassembler-four-args: [ OFF ]

  CC   map_perf_ring.o
  CC   xlated_dumper.o
  CC   perf.o
  CC   prog.o
prog.c: In function ‘do_load’:
prog.c:785:18: warning: implicit declaration of function ‘reallocarray’ 
[-Wimplicit-function-declaration]
map_replace = reallocarray(map_replace, old_map_fds + 1,
  ^~~~
prog.c:785:16: warning: assignment makes pointer from integer without a cast 
[-Wint-conversion]
map_replace = reallocarray(map_replace, old_map_fds + 1,
^
  CC   common.o
  CC   cgroup.o
  CC   main.o
  CC   json_writer.o
  CC   cfg.o
  CC   map.o
  CC   jit_disasm.o
  CC   disasm.o

Auto-detecting system features:
...libelf: [ on  ]
...   bpf: [ on  ]

Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from 
latest version at 'include/uapi/linux/bpf.h'
  CC   libbpf.o
  CC   bpf.o
  CC   nlattr.o
  CC   btf.o
  LD   libbpf-in.o
  LINK libbpf.a
  LINK bpftool
prog.o: In function `do_load':
prog.c:(.text+0x23d): undefined reference to `reallocarray'
collect2: error: ld returned 1 exit status
Makefile:89: recipe for target 'bpftool' failed
make[1]: *** [bpftool] Error 1
Makefile:99: recipe for target 'bpftool' failed
make: *** [bpftool] Error 2

Thanks,
Daniel


Re: [PATCH net-next 09/18] tls: Add rx inline crypto offload

2018-07-05 Thread kbuild test robot
Hi Boris,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=ia64 

Note: the 
linux-review/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704 HEAD 
fbaef8a3b3a3283de49a7171144b7471e5c780d9 builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   In file included from drivers/net/ethernet/mellanox/mlx5/core/en_main.c:45:
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:53:29: error: field 
'base' has incomplete type
 struct tls_offload_context base;
^~~~
   In file included from include/linux/kernel.h:10,
from include/linux/list.h:9,
from include/linux/timer.h:5,
from include/linux/netdevice.h:28,
from include/net/sch_generic.h:5,
from include/net/act_api.h:9,
from include/net/tc_act/tc_gact.h:5,
from drivers/net/ethernet/mellanox/mlx5/core/en_main.c:33:
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h: In function 
'mlx5e_get_tls_tx_context':
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:62:8: error: 
'TLS_OFFLOAD_CONTEXT_SIZE' undeclared (first use in this function); did you 
mean 'TLS_OFFLOAD_CONTEXT_SIZE_TX'?
   TLS_OFFLOAD_CONTEXT_SIZE);
   ^~~~
   include/linux/compiler.h:316:19: note: in definition of macro 
'__compiletime_assert'
  bool __cond = !(condition);\
  ^
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:69:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^~~~
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:61:2: note: in 
expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) >
 ^~~~
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:62:8: note: each 
undeclared identifier is reported only once for each function it appears in
   TLS_OFFLOAD_CONTEXT_SIZE);
   ^~~~
   include/linux/compiler.h:316:19: note: in definition of macro 
'__compiletime_assert'
  bool __cond = !(condition);\
  ^
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:69:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^~~~
   drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:61:2: note: in 
expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) >
 ^~~~
   In file included from include/linux/list.h:9,
from include/linux/timer.h:5,
from include/linux/netdevice.h:28,
from include/net/sch_generic.h:5,
from include/net/act_api.h:9,
from include/net/tc_act/tc_gact.h:5,
from drivers/net/ethernet/mellanox/mlx5/core/en_main.c:33:
>> drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h:63:22: error: 
>> implicit declaration of function 'tls_offload_ctx'; did you mean 
>> 'tls_offload_ctx_rx'? [-Werror=implicit-function-declaration]
 return container_of(tls_offload_ctx(tls_ctx),
 ^~~
   include/linux/kernel.h:963:26: note: in definition of macro 'container_of'
 void *__mptr = (void *)(ptr); \
 ^~~
   include/linux/kernel.h:963:17: warning: cast to pointer from integer of 
different size [-Wint-to-pointer-cast]
 void *__mptr = (void *)(ptr); \
^
   drivers/n

Re: [PATCH net-next 05/18] tls: Refactor tls_offload variable names

2018-07-05 Thread kbuild test robot
Hi Boris,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704
config: x86_64-randconfig-s0-07051307 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

Note: the 
linux-review/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704 HEAD 
fbaef8a3b3a3283de49a7171144b7471e5c780d9 builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   net//tls/tls_device.c: In function 'tls_device_free_ctx':
>> net//tls/tls_device.c:55:22: error: 'TLS_HW' undeclared (first use in this 
>> function)
 if (ctx->tx_conf == TLS_HW)
 ^~
   net//tls/tls_device.c:55:22: note: each undeclared identifier is reported 
only once for each function it appears in

vim +/TLS_HW +55 net//tls/tls_device.c

52  
53  static void tls_device_free_ctx(struct tls_context *ctx)
54  {
  > 55  if (ctx->tx_conf == TLS_HW)
56  kfree(tls_offload_ctx_tx(ctx));
57  
58  kfree(ctx);
59  }
60  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH net-next 05/18] tls: Refactor tls_offload variable names

2018-07-05 Thread kbuild test robot
Hi Boris,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=ia64 

Note: the 
linux-review/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704 HEAD 
fbaef8a3b3a3283de49a7171144b7471e5c780d9 builds fine.
  It only hurts bisectability.

All error/warnings (new ones prefixed by >>):

   net/tls/tls_device.c: In function 'tls_device_free_ctx':
>> net/tls/tls_device.c:55:22: error: 'TLS_HW' undeclared (first use in this 
>> function); did you mean 'TLS_TX'?
 if (ctx->tx_conf == TLS_HW)
 ^~
 TLS_TX
   net/tls/tls_device.c:55:22: note: each undeclared identifier is reported 
only once for each function it appears in
--
   In file included from drivers/net/ethernet//mellanox/mlx5/core/en_main.c:45:
>> drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:53:29: error: field 
>> 'base' has incomplete type
 struct tls_offload_context base;
^~~~
   In file included from include/linux/kernel.h:10,
from include/linux/list.h:9,
from include/linux/timer.h:5,
from include/linux/netdevice.h:28,
from include/net/sch_generic.h:5,
from include/net/act_api.h:9,
from include/net/tc_act/tc_gact.h:5,
from drivers/net/ethernet//mellanox/mlx5/core/en_main.c:33:
   drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h: In function 
'mlx5e_get_tls_tx_context':
>> drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:62:8: error: 
>> 'TLS_OFFLOAD_CONTEXT_SIZE' undeclared (first use in this function); did you 
>> mean 'TLS_OFFLOAD_CONTEXT_SIZE_TX'?
   TLS_OFFLOAD_CONTEXT_SIZE);
   ^~~~
   include/linux/compiler.h:316:19: note: in definition of macro 
'__compiletime_assert'
  bool __cond = !(condition);\
  ^
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:69:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^~~~
   drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:61:2: note: in 
expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) >
 ^~~~
   drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:62:8: note: each 
undeclared identifier is reported only once for each function it appears in
   TLS_OFFLOAD_CONTEXT_SIZE);
   ^~~~
   include/linux/compiler.h:316:19: note: in definition of macro 
'__compiletime_assert'
  bool __cond = !(condition);\
  ^
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:69:2: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^~~~
   drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:61:2: note: in 
expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) >
 ^~~~
   In file included from include/linux/list.h:9,
from include/linux/timer.h:5,
from include/linux/netdevice.h:28,
from include/net/sch_generic.h:5,
from include/net/act_api.h:9,
from include/net/tc_act/tc_gact.h:5,
from drivers/net/ethernet//mellanox/mlx5/core/en_main.c:33:
>> drivers/net/ethernet//mellanox/mlx5/core/en_accel/tls.h:63:22: error: 
>> implicit declaration of function 'tls_offload_ctx'; did you mean 
>> 'tls_offload_ct

Re: [PATCH net-next 09/18] tls: Add rx inline crypto offload

2018-07-05 Thread kbuild test robot
Hi Boris,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Boris-Pismenny/TLS-offload-rx-netdev-mlx5/20180705-064704
config: x86_64-randconfig-s1-07051503 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   net//tls/tls_sw.c: In function 'decrypt_skb_update':
>> net//tls/tls_sw.c:685:9: warning: 'err' may be used uninitialized in this 
>> function [-Wmaybe-uninitialized]
 return err;
^~~

vim +/err +685 net//tls/tls_sw.c

c46234eb Dave Watson2018-03-22  659  
13000621 Boris Pismenny 2018-07-04  660  static int decrypt_skb_update(struct 
sock *sk, struct sk_buff *skb,
13000621 Boris Pismenny 2018-07-04  661   struct 
scatterlist *sgout)
13000621 Boris Pismenny 2018-07-04  662  {
13000621 Boris Pismenny 2018-07-04  663 struct tls_context *tls_ctx = 
tls_get_ctx(sk);
13000621 Boris Pismenny 2018-07-04  664 struct tls_sw_context_rx *ctx = 
tls_sw_ctx_rx(tls_ctx);
13000621 Boris Pismenny 2018-07-04  665 struct strp_msg *rxm = 
strp_msg(skb);
13000621 Boris Pismenny 2018-07-04  666 int err;
13000621 Boris Pismenny 2018-07-04  667  
52ffb3bf Boris Pismenny 2018-07-04  668  #ifdef CONFIG_TLS_DEVICE
52ffb3bf Boris Pismenny 2018-07-04  669 err = tls_device_decrypted(sk, 
skb);
52ffb3bf Boris Pismenny 2018-07-04  670 if (err < 0)
52ffb3bf Boris Pismenny 2018-07-04  671 return err;
52ffb3bf Boris Pismenny 2018-07-04  672  #endif
52ffb3bf Boris Pismenny 2018-07-04  673 if (!ctx->decrypted) {
13000621 Boris Pismenny 2018-07-04  674 err = decrypt_skb(sk, 
skb, sgout);
13000621 Boris Pismenny 2018-07-04  675 if (err < 0)
13000621 Boris Pismenny 2018-07-04  676 return err;
52ffb3bf Boris Pismenny 2018-07-04  677 }
13000621 Boris Pismenny 2018-07-04  678  
13000621 Boris Pismenny 2018-07-04  679 rxm->offset += 
tls_ctx->rx.prepend_size;
13000621 Boris Pismenny 2018-07-04  680 rxm->full_len -= 
tls_ctx->rx.overhead_size;
13000621 Boris Pismenny 2018-07-04  681 tls_advance_record_sn(sk, 
&tls_ctx->rx);
13000621 Boris Pismenny 2018-07-04  682 ctx->decrypted = true;
13000621 Boris Pismenny 2018-07-04  683 ctx->saved_data_ready(sk);
13000621 Boris Pismenny 2018-07-04  684  
13000621 Boris Pismenny 2018-07-04 @685 return err;
13000621 Boris Pismenny 2018-07-04  686  }
13000621 Boris Pismenny 2018-07-04  687  

:: The code at line 685 was first introduced by commit
:: 1300062159ee8551834a4371379b82abe20f436e tls: Split decrypt_skb to two 
functions

:: TO: Boris Pismenny 
:: CC: 0day robot 

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCHv2 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-05 Thread Xin Long
On Thu, Jul 5, 2018 at 4:39 AM, Ido Schimmel  wrote:
> On Thu, Jul 05, 2018 at 01:56:23AM +0800, Xin Long wrote:
>> On Wed, Jul 4, 2018 at 3:23 AM, David Ahern  wrote:
>> > your commands are not a proper test. The test should succeed and fail
>> > based on the routing lookup, not iptables rules.
>> A proper test can be done easily with netns, as vrf can't isolate much.
>> I don't want to bother forwarding/ directory with netns, so I will probably
>> just drop this selftest, and let the feature patch go first.
>>
>> What do you think?
>
> You can add a tc rule on the ingress of h2 and make sure that in the
> first case ping succeeds and the tc rule wasn't hit. In the second case
> ping should also succeed, but the tc rule should be hit. This is similar
> to your original netns test.
With netns, it will be much easier to use
sysctl net.ipv4.icmp_echo_ignore_broadcasts
to block the echo_request on r1 or h2, and check if ping works.
(this is more like the idea of using 'iptables' above) :D

>
> You can look at tc_flower.sh for reference and in particular at
> tc_check_packets().
This is an idea very similar to using tcpdump; I just feel it's too much,
this test should be as simple as route.sh. :)


  1   2   >