date:20161016

[RFC PATCH v3 2/2] L2TP:Adjust intf MTU,factor underlay L3,overlay L2

2016-10-16 Thread R. Parameswaran



[v3: Picked up review comments from James Chapman, added a
 function  to compute ip header + ip option overhead on a socket, and factored
 it  into L2TP change-set, RFC, would like early feedback on name and
 placement  of new function while I test this.

 Part 2/2: Changes in l2tp_eth.c, using the new API from part 1]

>From f4066da53e781ef167055c1e89ca1a7819215a40 Mon Sep 17 00:00:00 2001
From: "R. Parameswaran" 
Date: Sun, 16 Oct 2016 20:27:20 -0700

In existing kernel code, when setting up the L2TP interface, not all of the
tunnel encapsulation headers are taken into account when setting
up the MTU on the L2TP logical interface device. Due to this, the
packets created by the applications on top of the L2TP layer are larger
than they ought to be, relative to the underlay MTU, which leads to
needless fragmentation once the L2TP packet is encapsulated in an outer IP
packet.

Specifically, the MTU calculation  does not take into account the (outer)
IP header imposed on the encapsulated L2TP packet, and the Layer 2 header
imposed on the inner L2TP packet prior to encapsulation. The patch posted
here takes care of these.

Existing code also seems to assume an Ethernet (non-jumbo) underlay. The
patch uses the PMTU mechanism and the dst entry in the L2TP tunnel socket
to directly pull up the underlay MTU (as the baseline number on top of
which the encapsulation headers are factored in).  Ethernet MTU is
assumed as a fallback only if this fails.

Picked up review comments from James Chapman, added a function
to compute ip header + ip option overhead on a socket, and factored it
into L2TP change-set.

Signed-off-by: nprac...@brocade.com,
Signed-off-by: bh...@brocade.com,
Signed-off-by: rshea...@brocade.com,
Signed-off-by: dfaw...@brocade.com
---
 net/l2tp/l2tp_eth.c | 51 +++
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 965f7e3..75eb5d3 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -30,6 +30,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "l2tp_core.h"
 
@@ -206,6 +209,49 @@ static void l2tp_eth_show(struct seq_file *m, void *arg)
 }
 #endif
 
+static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
+   struct l2tp_session *session,
+   struct net_device *dev)
+{
+   unsigned int overhead = 0;
+   struct dst_entry *dst;
+   u32 l3_overhead = 0;
+
+   if (session->mtu != 0) {
+   dev->mtu = session->mtu;
+   dev->needed_headroom += session->hdr_len;
+   if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+   dev->needed_headroom += sizeof(struct udphdr);
+   return;
+   }
+   overhead = session->hdr_len;
+   l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
+   if (!tunnel->sock || (l3_overhead == 0)) {
+   /* L3 Overhead couldn't be identified, dev mtu stays at 1500 */
+   return;
+   }
+   /* Adjust MTU, factor overhead - underlay L3, overlay L2 hdr*/
+   overhead += ETH_HLEN + l3_overhead;
+   /* Additionally, if the encap is UDP, account for UDP header size */
+   if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+   overhead += sizeof(struct udphdr);
+   /* If PMTU discovery was enabled, use discovered MTU on L2TP device */
+   dst = sk_dst_get(tunnel->sock);
+   if (dst) {
+   /* dst_mtu will use PMTU if found, else fallback to intf MTU */
+   u32 pmtu = dst_mtu(dst);
+
+   if (pmtu != 0)
+   dev->mtu = pmtu;
+   dst_release(dst);
+   }
+   session->mtu = dev->mtu - overhead;
+   dev->mtu = session->mtu;
+   dev->needed_headroom += session->hdr_len;
+   if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+   dev->needed_headroom += sizeof(struct udphdr);
+}
+
 static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 
peer_session_id, struct l2tp_session_cfg *cfg)
 {
struct net_device *dev;
@@ -255,11 +301,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, 
u32 session_id, u32 p
}
 
dev_net_set(dev, net);
-   if (session->mtu == 0)
-   session->mtu = dev->mtu - session->hdr_len;
-   dev->mtu = session->mtu;
-   dev->needed_headroom += session->hdr_len;
 
+   l2tp_eth_adjust_mtu(tunnel, session, dev);
priv = netdev_priv(dev);
priv->dev = dev;
priv->session = session;
-- 
2.1.4



> 
> I think keep it simple. A function to return the size of the IP header
> associated with any IP socket, not necessarily a tunnel socket. Don't
> mix in any MTU derivation logic or UDP header size etc.
> 
> Post code early as an RFC. You're more likely to get review feedback
> from others.
> 
> 
> 
>

[PATCH net-next] ila: Don't use dest cache when gateway is set

2016-10-16 Thread Tom Herbert

If the gateway is set on an ILA route we don't need to bother with using
the destination cache in the ILA route. Translation does not change the
routing in this case so we can stick with orig_output in the lwstate
output function.

Tested: Ran netperf with and without gateway for LWT route.

Signed-off-by: Tom Herbert 
---
 net/ipv6/ila/ila_lwt.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index d0a98d9..9fafba6 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -36,6 +36,7 @@ static inline struct ila_params *ila_params_lwtunnel(
 static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
struct dst_entry *orig_dst = skb_dst(skb);
+   struct rt6_info *rt = (struct rt6_info *)orig_dst;
struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
struct dst_entry *dst;
int err = -EINVAL;
@@ -46,6 +47,13 @@ static int ila_output(struct net *net, struct sock *sk, 
struct sk_buff *skb)
ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
true);
 
+   if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
+   /* Already have a next hop address in route, no need for
+* dest cache route.
+*/
+   return orig_dst->lwtstate->orig_output(net, sk, skb);
+   }
+
dst = dst_cache_get(>dst_cache);
if (unlikely(!dst)) {
struct ipv6hdr *ip6h = ipv6_hdr(skb);
-- 
2.9.3

[RFC PATCH v3 1/2] L2TP:Adjust intf MTU,factor underlay L3,overlay L2

2016-10-16 Thread R. Parameswaran


[v3: Picked up review comments from James Chapman, added a
 function  to compute ip header + ip option overhead on a socket, and factored
 it  into L2TP change-set, RFC, would like early feedback on name and
 placement, and logic  of new function while I test this]

>From 30c4b3900d09deb912fc6ce4af3c19e870f84e14 Mon Sep 17 00:00:00 2001
From: "R. Parameswaran" 
Date: Sun, 16 Oct 2016 20:19:38 -0700

In existing kernel code, when setting up the L2TP interface, not all of the
tunnel encapsulation headers are taken into account when setting
up the MTU on the L2TP logical interface device. Due to this, the
packets created by the applications on top of the L2TP layer are larger
than they ought to be, relative to the underlay MTU, which leads to
needless fragmentation once the L2TP packet is encapsulated in an outer IP
packet.

Specifically, the MTU calculation  does not take into account the (outer)
IP header imposed on the encapsulated L2TP packet, and the Layer 2 header
imposed on the inner L2TP packet prior to encapsulation. The patch posted
here takes care of these.

Existing code also seems to assume an Ethernet (non-jumbo) underlay. The
patch uses the PMTU mechanism and the dst entry in the L2TP tunnel socket
to directly pull up the underlay MTU (as the baseline number on top of
which the encapsulation headers are factored in).  Ethernet MTU is
assumed as a fallback only if this fails.

Picked up review comments from James Chapman, added a function
to compute ip header + ip option overhead on a socket, and factored it
into L2TP change-set.

Signed-off-by: nprac...@brocade.com,
Signed-off-by: bh...@brocade.com,
Signed-off-by: rshea...@brocade.com,
Signed-off-by: dfaw...@brocade.com
---
 include/linux/net.h |  3 +++
 net/socket.c| 37 +
 2 files changed, 40 insertions(+)

diff --git a/include/linux/net.h b/include/linux/net.h
index cd0c8bd..2c8b092 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -298,6 +298,9 @@ int kernel_sendpage(struct socket *sock, struct page *page, 
int offset,
 int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
 int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how);
 
+/* Following routine returns the IP overhead imposed by a socket.  */
+u32 kernel_sock_ip_overhead(struct sock *sk);
+
 #define MODULE_ALIAS_NETPROTO(proto) \
MODULE_ALIAS("net-pf-" __stringify(proto))
 
diff --git a/net/socket.c b/net/socket.c
index 5a9bf5e..d5e79c2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3293,3 +3293,40 @@ int kernel_sock_shutdown(struct socket *sock, enum 
sock_shutdown_cmd how)
return sock->ops->shutdown(sock, how);
 }
 EXPORT_SYMBOL(kernel_sock_shutdown);
+
+/*
+ * This routine returns the IP overhead imposed by a socket i.e.
+ * the length of the underlying IP header, depending on whether
+ *  this is an IPv4 or IPv6 socket and the length from IP options turned
+ *  on at the socket.
+ */
+u32 kernel_sock_ip_overhead(struct sock *sk)
+{
+   u32 overhead = 0;
+   if (!sk)
+   goto done;
+   if (sk->sk_family == AF_INET) {
+   struct ip_options_rcu *opt = NULL;
+   struct inet_sock *inet = inet_sk(sk);
+   overhead += sizeof(struct iphdr);
+   if (inet)
+   opt = rcu_dereference_protected(inet->inet_opt,
+   sock_owned_by_user(sk));
+   if (opt)
+   overhead += opt->opt.optlen;
+   }
+   else if (sk->sk_family == AF_INET6) {
+   struct ipv6_pinfo *np = inet6_sk(sk);
+   struct ipv6_txoptions *opt = NULL;
+   overhead += sizeof(struct ipv6hdr);
+   if (np)
+   opt = rcu_dereference_protected(np->opt,
+   sock_owned_by_user(sk));
+   if (opt)
+   overhead += (opt->opt_flen + opt->opt_nflen);
+   }
+
+done:
+   return overhead;
+}
+EXPORT_SYMBOL_GPL(kernel_sock_ip_overhead);
-- 
2.1.4


On Tue, 11 Oct 2016, James Chapman wrote:

> 
> I think keep it simple. A function to return the size of the IP header
> associated with any IP socket, not necessarily a tunnel socket. Don't
> mix in any MTU derivation logic or UDP header size etc.
> 
> Post code early as an RFC. You're more likely to get review feedback
> from others.
> 
> 
> 
>

[PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev

2016-10-16 Thread David Ahern

Currently, socket lookups for l3mdev (vrf) use cases can match a socket
that is bound to a port but not a device (ie., a global socket). If the
sysctl tcp_l3mdev_accept is not set this leads to ack packets going out
based on the main table even though the packet came in from an L3 domain.
The end result is that the connection is not established, creating
confusion for users since the service is running and a socket shows in
ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the
skb came through an interface enslaved to an l3mdev device and the
tcp_l3mdev_accept is not set.

skb's through an l3mdev interface are marked by setting a flag in
inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the
flag for IPv4. Using an skb flag avoids a device lookup on the dif. The
flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the
inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the
match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the
move is done after the socket lookup, so IP6CB is used.

The flags field in inet_skb_parm struct needs to be increased to add
another flag. There is currently a 1-byte hole following the flags,
so it can be expanded to u16 without increasing the size of the struct.

Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Signed-off-by: David Ahern 
---
v4
- renamed existing skb_l3mdev_slave to ipv6_l3mdev_skb
- renamed ipv4 version ipv4_l3mdev_skb

v3
- changed the match functions to pull the skb flag from TCP_SKB_CB
  rather than IPCB for IPv4 per changes from 971f10eca186. match
  function is moved to tcp.h as a consequence.
- made flags a u16 versus __u16 for consistency with frag_max_size
- updated commit message

v2
- reordered the checks in inet_exact_dif_match per Eric's comment
- changed the l3mdev determination from looking up the dif to using
  a flag set on the skb which is much faster

 drivers/net/vrf.c   |  2 ++
 include/linux/ipv6.h| 17 ++---
 include/net/ip.h|  8 +++-
 include/net/tcp.h   | 13 -
 net/ipv4/inet_hashtables.c  |  8 +---
 net/ipv6/inet6_hashtables.c |  7 ---
 6 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 85c271c70d42..820de6a9ddde 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -956,6 +956,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
if (skb->pkt_type == PACKET_LOOPBACK) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
+   IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
skb->pkt_type = PACKET_HOST;
goto out;
}
@@ -996,6 +997,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device 
*vrf_dev,
 {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
+   IPCB(skb)->flags |= IPSKB_L3SLAVE;
 
/* loopback traffic; do not push through packet taps again.
 * Reset pkt_type for upper layers to process skb
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7e9a789be5e0..ca1ad9ebbc92 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -123,12 +123,12 @@ struct inet6_skb_parm {
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
-static inline bool skb_l3mdev_slave(__u16 flags)
+static inline bool ipv6_l3mdev_skb(__u16 flags)
 {
return flags & IP6SKB_L3SLAVE;
 }
 #else
-static inline bool skb_l3mdev_slave(__u16 flags)
+static inline bool ipv6_l3mdev_skb(__u16 flags)
 {
return false;
 }
@@ -139,11 +139,22 @@ static inline bool skb_l3mdev_slave(__u16 flags)
 
 static inline int inet6_iif(const struct sk_buff *skb)
 {
-   bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+   bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags);
 
return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
+/* can not be used in TCP layer after tcp_v6_fill_cb */
+static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+   if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
+   ipv6_l3mdev_skb(IP6CB(skb)->flags))
+   return true;
+#endif
+   return false;
+}
+
 struct tcp6_request_sock {
struct tcp_request_sock   tcp6rsk_tcp;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index bc43c0fcae12..c9d07988911e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -38,7 +38,7 @@ struct sock;
 struct inet_skb_parm {
int iif;
struct ip_options   opt;/* Compiled IP options  
*/
-   unsigned char   flags;
+   u16 flags;
 
 #define IPSKB_FORWARDEDBIT(0)
 #define IPSKB_XFRM_TUNNEL_SIZE BIT(1)
@@ -48,10 +48,16 @@ struct inet_skb_parm {
 #define IPSKB_DOREDIRECT   BIT(5)
 #define IPSKB_FRAG_PMTUBIT(6)
 #define IPSKB_FRAG_SEGS

Re: [PATCH 7/8] tools lib bpf: fix maps resolution

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

It is not correct to treat the ELF data of the maps section
as an array of map definitions. In fact the sizes differ. The
offset provided in the symbol section has to be used instead.

This patch fixes a bug causing an ELF file with two maps not to load
correctly.


Could you please give an example so we can understand why
section 'maps' is not an array?

Thank you.

Re: [PATCH 5/8] tools lib bpf: add missing functions

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

Some functions were missing in the library to be able to use it
in the case where the userspace is handling the maps in kernel.

The patch also renames functions to have a homogeneous naming
convention.

Signed-off-by: Eric Leblond 
---
  tools/lib/bpf/bpf.c| 35 ++-
  tools/lib/bpf/bpf.h|  2 --
  tools/lib/bpf/libbpf.h |  5 +
  3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 4212ed6..c0e07bd 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -25,6 +25,7 @@
  #include 
  #include 
  #include "bpf.h"
+#include "libbpf.h"
  
  /*

   * When building perf, unistd.h is overrided. __NR_bpf is
@@ -97,7 +98,7 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn 
*insns,
return sys_bpf(BPF_PROG_LOAD, , sizeof(attr));
  }
  
-int bpf_map_update_elem(int fd, void *key, void *value,

+int bpf_map__update_elem(int fd, void *key, void *value,
u64 flags)


Please don't use a '__' style API here. It can easily be confused with
bpf_map__*() in libbpf.h. They are APIs at different levels.

bpf_map__*() are APIs for 'struct bpf_map's, which are objects introduced
by libbpf and defined in libbpf.h. bpf_map_*() APIs operate on fds, which are
objects defined by the kernel. bpf_map_*() APIs are declared in bpf.h.

In libbpf, bpf.h directly operates on kernel objects (fd), APIs in it
are named bpf_map_*(); libbpf.h operates on 'struct bpf_map' object,
APIs in it are named using bpf_map__*(). libbpf.h and bpf.h are independent
of each other.


  {
union bpf_attr attr;
@@ -110,3 +111,35 @@ int bpf_map_update_elem(int fd, void *key, void *value,
  
  	return sys_bpf(BPF_MAP_UPDATE_ELEM, , sizeof(attr));

  }
+
+int bpf_map__lookup_elem(int fd, void *key, void *value)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   .value = ptr_to_u64(value),
+   };
+
+   return sys_bpf(BPF_MAP_LOOKUP_ELEM, , sizeof(attr));
+}
+
+int bpf_map__delete_elem(int fd, void *key)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   };
+
+   return sys_bpf(BPF_MAP_DELETE_ELEM, , sizeof(attr));
+}
+
+int bpf_map__get_next_key(int fd, void *key, void *next_key)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   .next_key = ptr_to_u64(next_key),
+   };
+
+   return sys_bpf(BPF_MAP_GET_NEXT_KEY, , sizeof(attr));
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index e8ba540..5ca834a 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -33,6 +33,4 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn 
*insns,
 u32 kern_version, char *log_buf,
 size_t log_buf_sz);
  
-int bpf_map_update_elem(int fd, void *key, void *value,

-   u64 flags);
  #endif
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a18783b..dfb46d0 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -207,6 +207,11 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
  int bpf_map__fd(struct bpf_map *map);
  const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
  const char *bpf_map__name(struct bpf_map *map);
+int bpf_map__update_elem(int fd, void *key, void *value,
+   uint64_t flags);
+int bpf_map__lookup_elem(int fd, void *key, void *value);
+int bpf_map__delete_elem(int fd, void *key);
+int bpf_map__get_next_key(int fd, void *key, void *next_key);


As we have discussed, the newly introduced functions should be added
to bpf.h.

Thank you.

Re: [PATCH 6/8] tools lib bpf: improve warning

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

Signed-off-by: Eric Leblond 


Please add some commit messages.

Thank you.


---
  tools/lib/bpf/libbpf.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7cd341e..1fe4532 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -802,7 +802,8 @@ bpf_object__create_maps(struct bpf_object *obj)
size_t j;
int err = *pfd;
  
-			pr_warning("failed to create map: %s\n",

+   pr_warning("failed to create map (name: '%s'): %s\n",
+  obj->maps[i].name,
   strerror(errno));
for (j = 0; j < i; j++)
zclose(obj->maps[j].fd);

Re: [PATCH 4/8] tools lib bpf: export function to set type

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

The current API did not allow the user to set a type such as socket
filter. To avoid a setter function for each type, the patch simply
exports a set function that takes the type as a parameter.

Signed-off-by: Eric Leblond 
---
  tools/lib/bpf/libbpf.c | 19 +--
  tools/lib/bpf/libbpf.h |  3 +++
  2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 90932f1..7cd341e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1336,26 +1336,25 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
  }
  
-static void bpf_program__set_type(struct bpf_program *prog,

- enum bpf_prog_type type)
+int bpf_program__set_type(struct bpf_program *prog, unsigned int type)
  {
+   if (!prog)
+   return -EINVAL;
+   if (type >= __MAX_BPF_PROG_TYPE)
+   return -EINVAL;
+
prog->type = type;
+   return 0;
  }
  
  int bpf_program__set_tracepoint(struct bpf_program *prog)

  {
-   if (!prog)
-   return -EINVAL;
-   bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
-   return 0;
+   return bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
  }
  
  int bpf_program__set_kprobe(struct bpf_program *prog)

  {
-   if (!prog)
-   return -EINVAL;
-   bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
-   return 0;
+   return bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
  }
  
  static bool bpf_program__is_type(struct bpf_program *prog,

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e40c8d3..a18783b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -173,6 +173,9 @@ int bpf_program__set_kprobe(struct bpf_program *prog);
  bool bpf_program__is_tracepoint(struct bpf_program *prog);
  bool bpf_program__is_kprobe(struct bpf_program *prog);
  
+int bpf_program__set_type(struct bpf_program *prog,

+ unsigned int type);
+


Although you don't include uapi/linux/bpf.h in this patch, logically
you add this dependency.

Please continue adding bpf_program__set_socket_filter() and
bpf_program__is_socket_filter(), like what we do for tracepoint.
This way libbpf.h is independent of the kernel headers.

We can use macro in both .h and .c.

Thank you.

Re: [PATCH 3/8] tools: Sync tools/include/uapi/linux/bpf.h with the kernel

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

Signed-off-by: Eric Leblond 


Commit message is required.

Thank you.


---
  tools/include/uapi/linux/bpf.h | 52 ++
  1 file changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9e5fc16..570287f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -95,6 +95,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
+   BPF_PROG_TYPE_PERF_EVENT,
+   __MAX_BPF_PROG_TYPE,
  };
  
  #define BPF_PSEUDO_MAP_FD	1

@@ -375,6 +377,56 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
  
+	/**

+* bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership 
of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*< 0 error
+*/
+   BPF_FUNC_current_task_under_cgroup,
+
+   /**
+* bpf_skb_change_tail(skb, len, flags)
+* The helper will resize the skb to the given new size,
+* to be used f.e. with control messages.
+* @skb: pointer to skb
+* @len: new skb length
+* @flags: reserved
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_skb_change_tail,
+
+   /**
+* bpf_skb_pull_data(skb, len)
+* The helper will pull in non-linear data in case the
+* skb is non-linear and not all of len are part of the
+* linear section. Only needed for read/write with direct
+* packet access.
+* @skb: pointer to skb
+* @len: len to make read/writeable
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_skb_pull_data,
+
+   /**
+* bpf_csum_update(skb, csum)
+* Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
+* @skb: pointer to skb
+* @csum: csum to add
+* Return: csum on success or negative error
+*/
+   BPF_FUNC_csum_update,
+
+   /**
+* bpf_set_hash_invalid(skb)
+* Invalidate current skb>hash.
+* @skb: pointer to skb
+*/
+   BPF_FUNC_set_hash_invalid,
+
__BPF_FUNC_MAX_ID,
  };

Re: [PATCH 1/8] tools lib bpf: add error functions

2016-10-16 Thread Wangnan (F)




On 2016/10/17 5:18, Eric Leblond wrote:

The include of err.h is not explicitly needed in exported
functions and it was causing include conflicts with some existing
code due to redefining some macros.

To fix this, let's have error handling functions provided by the
library. Furthermore this will allow users to have a homogeneous
API.

Signed-off-by: Eric Leblond 
---
  tools/lib/bpf/libbpf.c | 11 +++
  tools/lib/bpf/libbpf.h |  4 +++-
  2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b699aea..90932f1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -31,6 +31,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  
@@ -1447,3 +1448,13 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name)

}
return NULL;
  }
+
+bool bpf__is_error(const void *ptr)


Please use libbpf_is_error(), like libbpf_set_print. We use '__' because
we want to use the OO concept. This utility is not OO.


+{
+   return IS_ERR(ptr);
+}
+
+long bpf__get_error(const void *ptr)


Same, please call it libbpf_get_error().

Thank you.

[PATCH] mac80211_hwsim: suggest nl80211 instead of wext driver in documentation

2016-10-16 Thread Linus Lüssing

For mac80211_hwsim interfaces, suggest to use wpa_supplicant with the more
modern, netlink based driver instead of wext.

Signed-off-by: Linus Lüssing 
---

Actually, I wasn't even able to make a connection with the configuration
files and information provided in
Documentation/networking/mac80211_hwsim/{README,hostapd.conf,wpa_supplicant.conf}

Changing -Dwext to -Dnl80211 helped and made the WPA-PSK connection with
mac80211_hwsim interfaces work for me.
---
 Documentation/networking/mac80211_hwsim/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/mac80211_hwsim/README 
b/Documentation/networking/mac80211_hwsim/README
index 24ac91d..3566a72 100644
--- a/Documentation/networking/mac80211_hwsim/README
+++ b/Documentation/networking/mac80211_hwsim/README
@@ -60,7 +60,7 @@ modprobe mac80211_hwsim
 hostapd hostapd.conf
 
 # Run wpa_supplicant (station) for wlan1
-wpa_supplicant -Dwext -iwlan1 -c wpa_supplicant.conf
+wpa_supplicant -Dnl80211 -iwlan1 -c wpa_supplicant.conf
 
 
 More test cases are available in hostap.git:
-- 
2.1.4

[PATCH] cxgb4: fix memory leak of qe on error exit path

2016-10-16 Thread Colin King

From: Colin Ian King 

A memory leak of qe occurs when t4_sched_queue_unbind fails,
so fix this by free'ing qe on the error exit path.

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/chelsio/cxgb4/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c 
b/drivers/net/ethernet/chelsio/cxgb4/sched.c
index 539de76..cbd68a8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sched.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c
@@ -210,8 +210,10 @@ static int t4_sched_queue_bind(struct port_info *pi, 
struct ch_sched_queue *p)
 
/* Unbind queue from any existing class */
err = t4_sched_queue_unbind(pi, p);
-   if (err)
+   if (err) {
+   t4_free_mem(qe);
goto out;
+   }
 
/* Bind queue to specified class */
memset(qe, 0, sizeof(*qe));
-- 
2.9.3

[PATCH 0/8] tools lib bpf: fixes and functional upgrade

2016-10-16 Thread Eric Leblond

Hello,

Here's a patchset on the libbpf library that can be found in
tools/lib/bpf.

Patches 1 to 4 add a new function to be able to set the BPF
program type. Until then, program types such as network filter can't
be loaded by the library:

* tools lib bpf: add error functions
* uapi linux bpf: add max value to enum
* tools: Sync tools/include/uapi/linux/bpf.h with the kernel
* tools lib bpf: export function to set type

Patch 5 is adding functions that were missing to handle maps in
userspace.

* tools lib bpf: add missing functions

Patch 7 fixes a bug in the parsing of BPF ELF file.

* tools lib bpf: fix maps resolution

Patch 8 updates 'make install' to install the header on the system.

* tools lib bpf: install header file


Patchset statistics:
 include/uapi/linux/bpf.h   |  1 +
 tools/include/uapi/linux/bpf.h | 56 
++--
 tools/lib/bpf/Makefile | 11 +--
 tools/lib/bpf/bpf.c| 35 ++-
 tools/lib/bpf/bpf.h|  2 --
 tools/lib/bpf/libbpf.c | 83 
+--
 tools/lib/bpf/libbpf.h | 12 +++-
 7 files changed, 166 insertions(+), 34 deletions(-)

Best regards,
--
Eric Leblond

[PATCH 8/8] tools lib bpf: install header file

2016-10-16 Thread Eric Leblond

Makefile was not installing the header file of the library and a
manual copy was needed to have a usable library on the system.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/Makefile | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 62d89d5..9525956 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -47,6 +47,7 @@ endif
 
 prefix ?= /usr/local
 libdir = $(prefix)/$(libdir_relative)
+includedir = $(prefix)/include/bpf
 man_dir = $(prefix)/share/man
 man_dir_SQ = '$(subst ','\'',$(man_dir))'
 
@@ -87,14 +88,16 @@ include $(FEATURES_DUMP)
 endif
 endif
 
-export prefix libdir src obj
+export prefix libdir includedir src obj
 
 # Shell quotes
 libdir_SQ = $(subst ','\'',$(libdir))
 libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
+includedir_SQ = $(subst ','\'',$(includedir))
 plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
 
 LIB_FILE = libbpf.a libbpf.so
+HEADER_FILE = libbpf.h
 
 VERSION= $(BPF_VERSION)
 PATCHLEVEL = $(BPF_PATCHLEVEL)
@@ -189,7 +192,11 @@ install_lib: all_cmd
$(call QUIET_INSTALL, $(LIB_FILE)) \
$(call do_install,$(LIB_FILE),$(libdir_SQ))
 
-install: install_lib
+install_header: all_cmd
+   $(call QUIET_INSTALL, $(HEADER_FILE)) \
+   $(call do_install,$(HEADER_FILE),$(includedir_SQ))
+
+install: install_lib install_header
 
 ### Cleaning rules
 
-- 
2.9.3

[PATCH 4/8] tools lib bpf: export function to set type

2016-10-16 Thread Eric Leblond

The current API did not allow the user to set a type such as socket
filter. To avoid a setter function for each type, the patch simply
exports a set function that takes the type as a parameter.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/libbpf.c | 19 +--
 tools/lib/bpf/libbpf.h |  3 +++
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 90932f1..7cd341e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1336,26 +1336,25 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
 }
 
-static void bpf_program__set_type(struct bpf_program *prog,
- enum bpf_prog_type type)
+int bpf_program__set_type(struct bpf_program *prog, unsigned int type)
 {
+   if (!prog)
+   return -EINVAL;
+   if (type >= __MAX_BPF_PROG_TYPE)
+   return -EINVAL;
+
prog->type = type;
+   return 0;
 }
 
 int bpf_program__set_tracepoint(struct bpf_program *prog)
 {
-   if (!prog)
-   return -EINVAL;
-   bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
-   return 0;
+   return bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
 }
 
 int bpf_program__set_kprobe(struct bpf_program *prog)
 {
-   if (!prog)
-   return -EINVAL;
-   bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
-   return 0;
+   return bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
 }
 
 static bool bpf_program__is_type(struct bpf_program *prog,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e40c8d3..a18783b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -173,6 +173,9 @@ int bpf_program__set_kprobe(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
 bool bpf_program__is_kprobe(struct bpf_program *prog);
 
+int bpf_program__set_type(struct bpf_program *prog,
+ unsigned int type);
+
 /*
  * We don't need __attribute__((packed)) now since it is
  * unnecessary for 'bpf_map_def' because they are all aligned.
-- 
2.9.3

[PATCH 5/8] tools lib bpf: add missing functions

2016-10-16 Thread Eric Leblond

Some functions were missing in the library to be able to use it
in the case where the userspace is handling the maps in kernel.

The patch also renames functions to have a homogeneous naming
convention.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/bpf.c| 35 ++-
 tools/lib/bpf/bpf.h|  2 --
 tools/lib/bpf/libbpf.h |  5 +
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 4212ed6..c0e07bd 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include "bpf.h"
+#include "libbpf.h"
 
 /*
  * When building perf, unistd.h is overrided. __NR_bpf is
@@ -97,7 +98,7 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn 
*insns,
return sys_bpf(BPF_PROG_LOAD, , sizeof(attr));
 }
 
-int bpf_map_update_elem(int fd, void *key, void *value,
+int bpf_map__update_elem(int fd, void *key, void *value,
u64 flags)
 {
union bpf_attr attr;
@@ -110,3 +111,35 @@ int bpf_map_update_elem(int fd, void *key, void *value,
 
return sys_bpf(BPF_MAP_UPDATE_ELEM, , sizeof(attr));
 }
+
+int bpf_map__lookup_elem(int fd, void *key, void *value)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   .value = ptr_to_u64(value),
+   };
+
+   return sys_bpf(BPF_MAP_LOOKUP_ELEM, , sizeof(attr));
+}
+
+int bpf_map__delete_elem(int fd, void *key)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   };
+
+   return sys_bpf(BPF_MAP_DELETE_ELEM, , sizeof(attr));
+}
+
+int bpf_map__get_next_key(int fd, void *key, void *next_key)
+{
+   union bpf_attr attr = {
+   .map_fd = fd,
+   .key = ptr_to_u64(key),
+   .next_key = ptr_to_u64(next_key),
+   };
+
+   return sys_bpf(BPF_MAP_GET_NEXT_KEY, , sizeof(attr));
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index e8ba540..5ca834a 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -33,6 +33,4 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn 
*insns,
 u32 kern_version, char *log_buf,
 size_t log_buf_sz);
 
-int bpf_map_update_elem(int fd, void *key, void *value,
-   u64 flags);
 #endif
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a18783b..dfb46d0 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -207,6 +207,11 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
 int bpf_map__fd(struct bpf_map *map);
 const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
 const char *bpf_map__name(struct bpf_map *map);
+int bpf_map__update_elem(int fd, void *key, void *value,
+   uint64_t flags);
+int bpf_map__lookup_elem(int fd, void *key, void *value);
+int bpf_map__delete_elem(int fd, void *key);
+int bpf_map__get_next_key(int fd, void *key, void *next_key);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
-- 
2.9.3

[PATCH 7/8] tools lib bpf: fix maps resolution

2016-10-16 Thread Eric Leblond

It is not correct to treat the ELF data of the maps section
as an array of map definitions. In fact the sizes differ. The
offset provided in the symbol section has to be used instead.

This patch fixes a bug causing an ELF file with two maps not to load
correctly.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/libbpf.c | 50 +++---
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1fe4532..f72628b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -186,6 +186,7 @@ struct bpf_program {
 struct bpf_map {
int fd;
char *name;
+   size_t offset;
struct bpf_map_def def;
void *priv;
bpf_map_clear_priv_t clear_priv;
@@ -529,13 +530,6 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
 
pr_debug("maps in %s: %zd bytes\n", obj->path, size);
 
-   obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));
-   if (!obj->maps) {
-   pr_warning("alloc maps for object failed\n");
-   return -ENOMEM;
-   }
-   obj->nr_maps = nr_maps;
-
for (i = 0; i < nr_maps; i++) {
struct bpf_map_def *def = >maps[i].def;
 
@@ -547,23 +541,42 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
obj->maps[i].fd = -1;
 
/* Save map definition into obj->maps */
-   *def = ((struct bpf_map_def *)data)[i];
+   *def = *(struct bpf_map_def *)(data + obj->maps[i].offset);
}
return 0;
 }
 
 static int
-bpf_object__init_maps_name(struct bpf_object *obj)
+bpf_object__init_maps_symbol(struct bpf_object *obj)
 {
int i;
+   int nr_maps = 0;
Elf_Data *symbols = obj->efile.symbols;
+   size_t map_idx = 0;
 
if (!symbols || obj->efile.maps_shndx < 0)
return -EINVAL;
 
+   /* get the number of maps */
+   for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+   GElf_Sym sym;
+
+   if (!gelf_getsym(symbols, i, ))
+   continue;
+   if (sym.st_shndx != obj->efile.maps_shndx)
+   continue;
+   nr_maps++;
+   }
+
+   obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));
+   if (!obj->maps) {
+   pr_warning("alloc maps for object failed\n");
+   return -ENOMEM;
+   }
+   obj->nr_maps = nr_maps;
+
for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
GElf_Sym sym;
-   size_t map_idx;
const char *map_name;
 
if (!gelf_getsym(symbols, i, ))
@@ -574,12 +587,12 @@ bpf_object__init_maps_name(struct bpf_object *obj)
map_name = elf_strptr(obj->efile.elf,
  obj->efile.strtabidx,
  sym.st_name);
-   map_idx = sym.st_value / sizeof(struct bpf_map_def);
if (map_idx >= obj->nr_maps) {
pr_warning("index of map \"%s\" is buggy: %zu > %zu\n",
   map_name, map_idx, obj->nr_maps);
continue;
}
+   obj->maps[map_idx].offset = sym.st_value;
obj->maps[map_idx].name = strdup(map_name);
if (!obj->maps[map_idx].name) {
pr_warning("failed to alloc map name\n");
@@ -587,6 +600,7 @@ bpf_object__init_maps_name(struct bpf_object *obj)
}
pr_debug("map %zu is \"%s\"\n", map_idx,
 obj->maps[map_idx].name);
+   map_idx++;
}
return 0;
 }
@@ -647,8 +661,6 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
data->d_buf,
data->d_size);
else if (strcmp(name, "maps") == 0) {
-   err = bpf_object__init_maps(obj, data->d_buf,
-   data->d_size);
obj->efile.maps_shndx = idx;
} else if (sh.sh_type == SHT_SYMTAB) {
if (obj->efile.symbols) {
@@ -698,8 +710,16 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
pr_warning("Corrupted ELF file: index of strtab invalid\n");
return LIBBPF_ERRNO__FORMAT;
}
-   if (obj->efile.maps_shndx >= 0)
-   err = bpf_object__init_maps_name(obj);
+   if (obj->efile.maps_shndx >= 0) {
+   Elf_Data *data;
+   err = bpf_object__init_maps_symbol(obj);
+   if (err)
+   goto out;
+
+   scn = elf_getscn(elf, obj->efile.maps_shndx);
+   data = elf_getdata(scn, 0);
+   err = bpf_object__init_maps(obj,

[PATCH 3/8] tools: Sync tools/include/uapi/linux/bpf.h with the kernel

2016-10-16 Thread Eric Leblond

Signed-off-by: Eric Leblond 
---
 tools/include/uapi/linux/bpf.h | 52 ++
 1 file changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9e5fc16..570287f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -95,6 +95,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
+   BPF_PROG_TYPE_PERF_EVENT,
+   __MAX_BPF_PROG_TYPE,
 };
 
 #define BPF_PSEUDO_MAP_FD  1
@@ -375,6 +377,56 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership 
of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*< 0 error
+*/
+   BPF_FUNC_current_task_under_cgroup,
+
+   /**
+* bpf_skb_change_tail(skb, len, flags)
+* The helper will resize the skb to the given new size,
+* to be used f.e. with control messages.
+* @skb: pointer to skb
+* @len: new skb length
+* @flags: reserved
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_skb_change_tail,
+
+   /**
+* bpf_skb_pull_data(skb, len)
+* The helper will pull in non-linear data in case the
+* skb is non-linear and not all of len are part of the
+* linear section. Only needed for read/write with direct
+* packet access.
+* @skb: pointer to skb
+* @len: len to make read/writeable
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_skb_pull_data,
+
+   /**
+* bpf_csum_update(skb, csum)
+* Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
+* @skb: pointer to skb
+* @csum: csum to add
+* Return: csum on success or negative error
+*/
+   BPF_FUNC_csum_update,
+
+   /**
+* bpf_set_hash_invalid(skb)
+* Invalidate current skb>hash.
+* @skb: pointer to skb
+*/
+   BPF_FUNC_set_hash_invalid,
+
__BPF_FUNC_MAX_ID,
 };
 
-- 
2.9.3

[PATCH 1/8] tools lib bpf: add error functions

2016-10-16 Thread Eric Leblond

The include of err.h is not explicitly needed in exported
functions and it was causing include conflicts with some existing
code due to redefining some macros.

To fix this, let's have error handling functions provided by the
library. Furthermore this will allow users to have a homogeneous
API.

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/libbpf.c | 11 +++
 tools/lib/bpf/libbpf.h |  4 +++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b699aea..90932f1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -1447,3 +1448,13 @@ bpf_object__find_map_by_name(struct bpf_object *obj, 
const char *name)
}
return NULL;
 }
+
+bool bpf__is_error(const void *ptr)
+{
+   return IS_ERR(ptr);
+}
+
+long bpf__get_error(const void *ptr)
+{
+   return PTR_ERR(ptr);
+}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index dd7a513..e40c8d3 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -23,7 +23,6 @@
 
 #include 
 #include 
-#include 
 
 enum libbpf_errno {
__LIBBPF_ERRNO__START = 4000,
@@ -211,4 +210,7 @@ int bpf_map__set_priv(struct bpf_map *map, void *priv,
  bpf_map_clear_priv_t clear_priv);
 void *bpf_map__priv(struct bpf_map *map);
 
+bool bpf__is_error(const void *ptr);
+long bpf__get_error(const void *ptr);
+
 #endif
-- 
2.9.3

[PATCH 6/8] tools lib bpf: improve warning

2016-10-16 Thread Eric Leblond

Signed-off-by: Eric Leblond 
---
 tools/lib/bpf/libbpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7cd341e..1fe4532 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -802,7 +802,8 @@ bpf_object__create_maps(struct bpf_object *obj)
size_t j;
int err = *pfd;
 
-   pr_warning("failed to create map: %s\n",
+   pr_warning("failed to create map (name: '%s'): %s\n",
+  obj->maps[i].name,
   strerror(errno));
for (j = 0; j < i; j++)
zclose(obj->maps[j].fd);
-- 
2.9.3

[PATCH 2/8] uapi linux bpf: add max value to enum

2016-10-16 Thread Eric Leblond

It will be used to detect userspace trying to set invalid value.

Signed-off-by: Eric Leblond 
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b..570287f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
+   __MAX_BPF_PROG_TYPE,
 };
 
 #define BPF_PSEUDO_MAP_FD  1
-- 
2.9.3

iproute: ss truncates abstract unix domain socket embedding null

2016-10-16 Thread Isaac Boukris

Hello,

The unix(7) man page says that null bytes have no special meaning in
an abstract unix domain socket address (which is why the length is
specified explicitly).

However, when such a name (with an embedded null) is used, ss (and netstat)
will only show up to the first null occurrence (second technically, if
we count the null prefix).
e.g. the name "\0/tmp/fo\0.sock" is displayed as: "@/tmp/fo" (whilst
strace tool shows it as: sun_path=@"/tmp/fo\0.sock").

Would it be more useful if it printed the whole name and escaped the null?
If so, would '\0' be ok for escaping the null?

Thanks!

Re: BUG: KASAN: use-after-free in udp_lib_get_port

2016-10-16 Thread Cong Wang

On Sun, Oct 16, 2016 at 6:46 AM, Baozeng Ding  wrote:
> Hello all,
> While running syzkaller fuzzer I have got the following use-after-free
> bug in udp_lib_get_port. The kernel version is 4.8.0+ (on Oct 7 commit 
> d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0). Unfortunately I failed to find a 
> reproducer for it.
>
> BUG: KASAN: use-after-free in udp_lib_get_port+0x1573/0x1860 at addr 
> 88000804cb60
> Write of size 8 by task syz-executor/31190
> CPU: 0 PID: 31190 Comm: syz-executor Not tainted 4.8.0+ #39
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>  880015ac7a48 829f835b 880032b531c0 88000804cb40
>  88000804d250 880017415a4a 880015ac7a70 8174d3cc
>  880015ac7b00 88000804cb00 880032b531c0 880015ac7af0
> Call Trace:
>  [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
>  [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
>  [< inline >] print_address_description mm/kasan/report.c:194
>  [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283
>  [< inline >] kasan_report mm/kasan/report.c:303
>  [] __asan_report_store8_noabort+0x3e/0x40 
> mm/kasan/report.c:329
>  [< inline >] hlist_add_head_rcu ./include/linux/rculist.h:487
>  [] udp_lib_get_port+0x1573/0x1860 net/ipv4/udp.c:345
>  [] udp_v6_get_port+0xa7/0xd0 net/ipv6/udp.c:106
>  [] inet6_bind+0x89c/0xfb0 net/ipv6/af_inet6.c:384
>  [] SYSC_bind+0x1ea/0x250 net/socket.c:1367
>  [] SyS_bind+0x24/0x30 net/socket.c:1353
>  [] entry_SYSCALL_64_fastpath+0x23/0xc6


We should have a reference to this sock via fd and its sock->sk too,
so I fail to see why it could be freed while we are holding this reference.
Maybe a VFS layer bug?

> Object at 88000804cb40, in cache UDPv6 size: 1496
> Allocated:
> PID = 30789
>  [  378.305168] [] save_stack_trace+0x16/0x20
>  [  378.305168] [] save_stack+0x46/0xd0
>  [  378.305168] [] kasan_kmalloc+0xad/0xe0
>  [  378.305168] [] kasan_slab_alloc+0x12/0x20
>  [  378.305168] [< inline >] slab_post_alloc_hook mm/slab.h:417
>  [  378.305168] [< inline >] slab_alloc_node mm/slub.c:2708
>  [  378.305168] [< inline >] slab_alloc mm/slub.c:2716
>  [  378.305168] [] kmem_cache_alloc+0xc8/0x2b0 
> mm/slub.c:2721
>  [  378.305168] [] sk_prot_alloc+0x69/0x2b0 
> net/core/sock.c:1326
>  [  378.305168] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388
>  [  378.305168] [] inet6_create+0x2d7/0x1000 
> net/ipv6/af_inet6.c:182
>  [  378.305168] [] __sock_create+0x37b/0x640 
> net/socket.c:1153
>  [  378.305168] [< inline >] sock_create net/socket.c:1193
>  [  378.305168] [< inline >] SYSC_socket net/socket.c:1223
>  [  378.305168] [] SyS_socket+0xef/0x1b0 net/socket.c:1203
>  [  378.305168] [] entry_SYSCALL_64_fastpath+0x23/0xc6
> Freed:
> PID = 30789
>  [  378.305168] [] save_stack_trace+0x16/0x20
>  [  378.305168] [] save_stack+0x46/0xd0
>  [  378.305168] [] kasan_slab_free+0x71/0xb0
>  [  378.305168] [< inline >] slab_free_hook mm/slub.c:1352
>  [  378.305168] [< inline >] slab_free_freelist_hook mm/slub.c:1374
>  [  378.305168] [< inline >] slab_free mm/slub.c:2951
>  [  378.305168] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973
>  [  378.305168] [< inline >] sk_prot_free net/core/sock.c:1369
>  [  378.305168] [] __sk_destruct+0x32b/0x4f0 
> net/core/sock.c:1444
>  [  378.305168] [] sk_destruct+0x44/0x80 
> net/core/sock.c:1452
>  [  378.305168] [] __sk_free+0x53/0x220 net/core/sock.c:1460
>  [  378.305168] [] sk_free+0x23/0x30 net/core/sock.c:1471
>  [  378.305168] [] sk_common_release+0x28c/0x3e0 
> ./include/net/sock.h:1589
>  [  378.305168] [] udp_lib_close+0x15/0x20 
> ./include/net/udp.h:203
>  [  378.305168] [] inet_release+0xed/0x1c0 
> net/ipv4/af_inet.c:415
>  [  378.305168] [] inet6_release+0x50/0x70 
> net/ipv6/af_inet6.c:422
>  [  378.305168] [] sock_release+0x8d/0x1d0 net/socket.c:570
>  [  378.305168] [] sock_close+0x16/0x20 net/socket.c:1017
>  [  378.305168] [] __fput+0x28c/0x780 fs/file_table.c:208
>  [  378.305168] [] fput+0x15/0x20 fs/file_table.c:244
>  [  378.305168] [] task_work_run+0xf9/0x170
>  [  378.305168] [] do_exit+0x85e/0x2a00
>  [  378.305168] [] do_group_exit+0x108/0x330
>  [  378.376437] [] get_signal+0x617/0x17a0 
> kernel/signal.c:2307
>  [  378.376437] [] do_signal+0x7f/0x18f0
>  [  378.376437] [] exit_to_usermode_loop+0xbf/0x150 
> arch/x86/entry/common.c:156
>  [  378.376437] [< inline >] prepare_exit_to_usermode 
> arch/x86/entry/common.c:190
>  [  378.376437] [] syscall_return_slowpath+0x1a0/0x1e0 
> arch/x86/entry/common.c:259
>  [  378.376437] [] entry_SYSCALL_64_fastpath+0xc4/0xc6
> Memory state around the buggy address:
>  88000804ca00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  88000804ca80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>88000804cb00: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
>

Re: net/l2tp:BUG: KASAN: use-after-free in l2tp_ip6_close

2016-10-16 Thread Cong Wang

On Sun, Oct 16, 2016 at 8:07 AM, Baozeng Ding  wrote:
> Hello,
> While running syzkaller fuzzer I have got the following use-after-free
> bug in l2tp_ip6_close. The kernel version is 4.8.0+ (on Oct 7 commit 
> d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0).
>
> BUG: KASAN: use-after-free in l2tp_ip6_close+0x22e/0x290 at addr 
> 8800081b0ed8
> Write of size 8 by task syz-executor/10987
> CPU: 0 PID: 10987 Comm: syz-executor Not tainted 4.8.0+ #39
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>  880031d97838 829f835b 88001b5a1640 8800081b0ec0
>  8800081b15a0 8800081b6d20 880031d97860 8174d3cc
>  880031d978f0 8800081b0e80 88001b5a1640 880031d978e0
> Call Trace:
>  [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
>  [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
>  [< inline >] print_address_description mm/kasan/report.c:194
>  [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283
>  [< inline >] kasan_report mm/kasan/report.c:303
>  [] __asan_report_store8_noabort+0x3e/0x40 
> mm/kasan/report.c:329
>  [< inline >] __write_once_size ./include/linux/compiler.h:249
>  [< inline >] __hlist_del ./include/linux/list.h:622
>  [< inline >] hlist_del_init ./include/linux/list.h:637
>  [] l2tp_ip6_close+0x22e/0x290 net/l2tp/l2tp_ip6.c:239


This one looks pretty interesting, how the hell could we call fput() twice
on the same fd...


>  [] inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415
>  [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422
>  [] sock_release+0x8d/0x1d0 net/socket.c:570
>  [] sock_close+0x16/0x20 net/socket.c:1017
>  [] __fput+0x28c/0x780 fs/file_table.c:208
>  [] fput+0x15/0x20 fs/file_table.c:244
>  [] task_work_run+0xf9/0x170
>  [] do_exit+0x85e/0x2a00
>  [] do_group_exit+0x108/0x330
>  [] get_signal+0x617/0x17a0 kernel/signal.c:2307
>  [] do_signal+0x7f/0x18f0
>  [] exit_to_usermode_loop+0xbf/0x150 
> arch/x86/entry/common.c:156
>  [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190
>  [] syscall_return_slowpath+0x1a0/0x1e0 
> arch/x86/entry/common.c:259
>  [] entry_SYSCALL_64_fastpath+0xc4/0xc6
> Object at 8800081b0ec0, in cache L2TP/IPv6 size: 1448
> Allocated:
> PID = 10987
>  [ 1116.897025] [] save_stack_trace+0x16/0x20
>  [ 1116.897025] [] save_stack+0x46/0xd0
>  [ 1116.897025] [] kasan_kmalloc+0xad/0xe0
>  [ 1116.897025] [] kasan_slab_alloc+0x12/0x20
>  [ 1116.897025] [< inline >] slab_post_alloc_hook mm/slab.h:417
>  [ 1116.897025] [< inline >] slab_alloc_node mm/slub.c:2708
>  [ 1116.897025] [< inline >] slab_alloc mm/slub.c:2716
>  [ 1116.897025] [] kmem_cache_alloc+0xc8/0x2b0 
> mm/slub.c:2721
>  [ 1116.897025] [] sk_prot_alloc+0x69/0x2b0 
> net/core/sock.c:1326
>  [ 1116.897025] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388
>  [ 1116.897025] [] inet6_create+0x2d7/0x1000 
> net/ipv6/af_inet6.c:182
>  [ 1116.897025] [] __sock_create+0x37b/0x640 
> net/socket.c:1153
>  [ 1116.897025] [< inline >] sock_create net/socket.c:1193
>  [ 1116.897025] [< inline >] SYSC_socket net/socket.c:1223
>  [ 1116.897025] [] SyS_socket+0xef/0x1b0 net/socket.c:1203
>  [ 1116.897025] [] entry_SYSCALL_64_fastpath+0x23/0xc6
> Freed:
> PID = 10987
>  [ 1116.897025] [] save_stack_trace+0x16/0x20
>  [ 1116.897025] [] save_stack+0x46/0xd0
>  [ 1116.897025] [] kasan_slab_free+0x71/0xb0
>  [ 1116.897025] [< inline >] slab_free_hook mm/slub.c:1352
>  [ 1116.897025] [< inline >] slab_free_freelist_hook mm/slub.c:1374
>  [ 1116.897025] [< inline >] slab_free mm/slub.c:2951
>  [ 1116.897025] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973
>  [ 1116.897025] [< inline >] sk_prot_free net/core/sock.c:1369
>  [ 1116.897025] [] __sk_destruct+0x32b/0x4f0 
> net/core/sock.c:1444
>  [ 1116.897025] [] sk_destruct+0x44/0x80 
> net/core/sock.c:1452
>  [ 1116.897025] [] __sk_free+0x53/0x220 net/core/sock.c:1460
>  [ 1116.897025] [] sk_free+0x23/0x30 net/core/sock.c:1471
>  [ 1116.897025] [] sk_common_release+0x28c/0x3e0 
> ./include/net/sock.h:1589
>  [ 1116.897025] [] l2tp_ip6_close+0x1fe/0x290 
> net/l2tp/l2tp_ip6.c:243
>  [ 1116.897025] [] inet_release+0xed/0x1c0 
> net/ipv4/af_inet.c:415
>  [ 1116.897025] [] inet6_release+0x50/0x70 
> net/ipv6/af_inet6.c:422
>  [ 1116.897025] [] sock_release+0x8d/0x1d0 net/socket.c:570
>  [ 1116.897025] [] sock_close+0x16/0x20 net/socket.c:1017
>  [ 1116.897025] [] __fput+0x28c/0x780 fs/file_table.c:208
>  [ 1116.897025] [] fput+0x15/0x20 fs/file_table.c:244
>  [ 1116.897025] [] task_work_run+0xf9/0x170
>  [ 1116.897025] [] do_exit+0x85e/0x2a00
>  [ 1116.897025] [] do_group_exit+0x108/0x330
>  [ 1116.897025] [] get_signal+0x617/0x17a0 
> kernel/signal.c:2307
>  [ 1116.897025] [] do_signal+0x7f/0x18f0
>  [ 1116.897025] [] exit_to_usermode_loop+0xbf/0x150 
> arch/x86/entry/common.c:156
>  [ 1116.897025] [<

Re: net/ipv6: potential deadlock in do_ipv6_setsockopt

2016-10-16 Thread Cong Wang

On Sun, Oct 16, 2016 at 6:34 AM, Baozeng Ding  wrote:
>  Possible unsafe locking scenario:
>
>CPU0CPU1
>
>   lock([  165.136033] sk_lock-AF_INET6
> );
>lock([  165.136033] rtnl_mutex
> );
>lock([  165.136033] sk_lock-AF_INET6
> );
>   lock([  165.136033] rtnl_mutex
> );
>
>  *** DEADLOCK ***

This is caused by the conditional rtnl locking in do_ipv6_setsockopt().
It looks like we miss the case of IPV6_ADDRFORM.

Please try the attached patch.
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 46ad699..b8c8d20 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -414,7 +414,9 @@ int inet6_release(struct socket *sock)
return -EINVAL;
 
/* Free mc lists */
+   rtnl_lock();
ipv6_sock_mc_close(sk);
+   rtnl_unlock();
 
/* Free ac lists */
ipv6_sock_ac_close(sk);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 5330262..1e4bcce 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -120,6 +120,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 static bool setsockopt_needs_rtnl(int optname)
 {
switch (optname) {
+   case IPV6_ADDRFORM:
case IPV6_ADD_MEMBERSHIP:
case IPV6_DROP_MEMBERSHIP:
case IPV6_JOIN_ANYCAST:
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 75c1fc5..41badfd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -282,10 +282,11 @@ void ipv6_sock_mc_close(struct sock *sk)
struct ipv6_mc_socklist *mc_lst;
struct net *net = sock_net(sk);
 
+   ASSERT_RTNL();
+
if (!rcu_access_pointer(np->ipv6_mc_list))
return;
 
-   rtnl_lock();
while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
struct net_device *dev;
 
@@ -305,7 +306,6 @@ void ipv6_sock_mc_close(struct sock *sk)
kfree_rcu(mc_lst, rcu);
 
}
-   rtnl_unlock();
 }
 
 int ip6_mc_source(int add, int omode, struct sock *sk,

[PATCH v2 net-next 1/5] net: dsa: mv88e6xxx: Implement interrupt support.

2016-10-16 Thread Andrew Lunn

The switch can have up to two interrupt controllers. One of these
contains the interrupts from the integrated PHYs, so is useful to
export. The Marvell PHY driver can then be used in interrupt mode,
rather than polling, speeding up PHY handling and reducing load on the
MDIO bus.

Signed-off-by: Andrew Lunn 
---
 .../devicetree/bindings/net/dsa/marvell.txt|  21 +-
 drivers/net/dsa/mv88e6xxx/chip.c   | 248 -
 drivers/net/dsa/mv88e6xxx/global2.c| 139 +++-
 drivers/net/dsa/mv88e6xxx/global2.h|  11 +
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h  |  31 +++
 5 files changed, 438 insertions(+), 12 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt 
b/Documentation/devicetree/bindings/net/dsa/marvell.txt
index 7629189398aa..32025eb4b31b 100644
--- a/Documentation/devicetree/bindings/net/dsa/marvell.txt
+++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt
@@ -20,16 +20,35 @@ Required properties:
 Optional properties:
 
 - reset-gpios  : Should be a gpio specifier for a reset line
-
+- interrupt-parent : Parent interrupt controller
+- interrupts   : Interrupt from the switch
+- interrupt-controller : Indicates the switch is itself an interrupt
+ controller. This is used for the PHY interrupts.
+#interrupt-cells = <2> : Controller uses two cells, number and flag
+- mdio : container of PHY and devices on the switches MDIO
+ bus
 Example:
 
mdio {
#address-cells = <1>;
#size-cells = <0>;
+   interrupt-parent = <>;
+   interrupts = <27 IRQ_TYPE_LEVEL_LOW>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
 
switch0: switch@0 {
compatible = "marvell,mv88e6085";
reg = <0>;
   reset-gpios = < 1 GPIO_ACTIVE_LOW>;
};
+   mdio {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   switch1phy0: switch1phy0@0 {
+   reg = <0>;
+   interrupt-parent = <>;
+   interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+   };
+   };
};
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 883fd9809dd2..ac032977b16c 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -18,11 +18,15 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -323,6 +327,164 @@ static int mv88e6xxx_serdes_write(struct mv88e6xxx_chip 
*chip, int reg, u16 val)
reg, val);
 }
 
+static void mv88e6xxx_g1_irq_mask(struct irq_data *d)
+{
+   struct mv88e6xxx_chip *chip = irq_data_get_irq_chip_data(d);
+   unsigned int n = d->hwirq;
+
+   chip->g1_irq.masked |= (1 << n);
+}
+
+static void mv88e6xxx_g1_irq_unmask(struct irq_data *d)
+{
+   struct mv88e6xxx_chip *chip = irq_data_get_irq_chip_data(d);
+   unsigned int n = d->hwirq;
+
+   chip->g1_irq.masked &= ~(1 << n);
+}
+
+static irqreturn_t mv88e6xxx_g1_irq_thread_fn(int irq, void *dev_id)
+{
+   struct mv88e6xxx_chip *chip = dev_id;
+   unsigned int nhandled = 0;
+   unsigned int sub_irq;
+   unsigned int n;
+   u16 reg;
+   int err;
+
+   mutex_lock(>reg_lock);
+   err = mv88e6xxx_g1_read(chip, GLOBAL_STATUS, );
+   mutex_unlock(>reg_lock);
+
+   if (err)
+   goto out;
+
+   for (n = 0; n < chip->g1_irq.nirqs; ++n) {
+   if (reg & (1 << n)) {
+   sub_irq = irq_find_mapping(chip->g1_irq.domain, n);
+   handle_nested_irq(sub_irq);
+   ++nhandled;
+   }
+   }
+out:
+   return (nhandled > 0 ? IRQ_HANDLED : IRQ_NONE);
+}
+
+static void mv88e6xxx_g1_irq_bus_lock(struct irq_data *d)
+{
+   struct mv88e6xxx_chip *chip = irq_data_get_irq_chip_data(d);
+
+   mutex_lock(>reg_lock);
+}
+
+static void mv88e6xxx_g1_irq_bus_sync_unlock(struct irq_data *d)
+{
+   struct mv88e6xxx_chip *chip = irq_data_get_irq_chip_data(d);
+   u16 mask = GENMASK(chip->g1_irq.nirqs, 0);
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_g1_read(chip, GLOBAL_CONTROL, );
+   if (err)
+   goto out;
+
+   reg &= ~mask;
+   reg |= (~chip->g1_irq.masked & mask);
+
+   err = mv88e6xxx_g1_write(chip, GLOBAL_CONTROL, reg);
+   if (err)
+   goto out;
+
+out:
+   mutex_unlock(>reg_lock);
+}
+
+static struct irq_chip mv88e6xxx_g1_irq_chip = {
+   .name   = "mv88e6xxx-g1",
+   .irq_mask

[PATCH v2 net-next 2/5] net: phy: Use threaded IRQ, to allow IRQ from sleeping devices

2016-10-16 Thread Andrew Lunn

The interrupt lines from PHYs may be connected to I2C bus expanders, or
come from switches on MDIO busses. Such interrupts are sourced from devices
which sleep, so use threaded interrupts. Threaded interrupts require
that the interrupt requester also uses the threaded API. Change the
phylib to use the threaded API, which is backwards compatible with
non-threaded IRQs.

Signed-off-by: Andrew Lunn 
---
v2: Add back IRQF_SHARED
---
 drivers/net/phy/phy.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index f424b867f73e..25f2b296aaba 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -739,10 +739,10 @@ static int phy_disable_interrupts(struct phy_device 
*phydev)
 int phy_start_interrupts(struct phy_device *phydev)
 {
atomic_set(>irq_disable, 0);
-   if (request_irq(phydev->irq, phy_interrupt,
-   IRQF_SHARED,
-   "phy_interrupt",
-   phydev) < 0) {
+   if (request_threaded_irq(phydev->irq, NULL, phy_interrupt,
+IRQF_ONESHOT | IRQF_SHARED,
+"phy_interrupt",
+phydev) < 0) {
pr_warn("%s: Can't get IRQ %d (PHY)\n",
phydev->mdio.bus->name, phydev->irq);
phydev->irq = PHY_POLL;
-- 
2.9.3

[PATCH v2 net-next 0/5] Interrupt support for mv88e6xxx

2016-10-16 Thread Andrew Lunn

This patchset adds interrupt controller support to the MV88E6xxx.  This
allows access to the interrupts the internal PHYs generate. These
interrupts can then be associated with a PHY device in the device tree
and used by the PHY lib, rather than polling.

Since interrupt handling needs to make MDIO bus accesses, threaded
interrupts are used. The phylib needs to request the PHY interrupt
using the threaded IRQ API. This in turn allows some simplification to
the code, in that the phylib interrupt handler can directly call
phy_change(), rather than use a work queue. The work queue is however
retained for the phy_mac_interrupt() call, which can be called in hard
interrupt context.

Since RFC v1:

Keep phy_mac_interrupt() callable in hard IRQ context.

The fix to trigger the phy state machine transitions on interrupts has
already been submitted, so is dropped from here.

Added back shared interrupts support.


Andrew Lunn (5):
  net: dsa: mv88e6xxx: Implement interrupt support.
  net: phy: Use threaded IRQ, to allow IRQ from sleeping devices
  net: phy: Threaded interrupts allow some simplification
  net: phy: Use phy name when requesting the interrupt
  arm: vf610: zii devel b: Add support for switch interrupts

 .../devicetree/bindings/net/dsa/marvell.txt|  21 +-
 arch/arm/boot/dts/vf610-zii-dev-rev-b.dts  |  51 +
 drivers/net/dsa/mv88e6xxx/chip.c   | 248 -
 drivers/net/dsa/mv88e6xxx/global2.c| 139 +++-
 drivers/net/dsa/mv88e6xxx/global2.h|  11 +
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h  |  31 +++
 drivers/net/phy/phy.c  |  52 +++--
 drivers/net/phy/phy_device.c   |   2 +-
 include/linux/phy.h|   5 +-
 9 files changed, 522 insertions(+), 38 deletions(-)

-- 
2.9.3

[PATCH v2 net-next 4/5] net: phy: Use phy name when requesting the interrupt

2016-10-16 Thread Andrew Lunn

Using the fixed name "phy_interrupt" is not very informative in
/proc/interrupts when there are a lot of phys, e.g. a device with an
Ethernet switch. So when requesting the interrupt, use the name of the
phy.

Signed-off-by: Andrew Lunn 
Acked-by: Florian Fainelli 
---
 drivers/net/phy/phy.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index bb673c63c85c..96bf03352bb8 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -736,8 +736,7 @@ int phy_start_interrupts(struct phy_device *phydev)
atomic_set(>irq_disable, 0);
if (request_threaded_irq(phydev->irq, NULL, phy_interrupt,
 IRQF_ONESHOT | IRQF_SHARED,
-"phy_interrupt",
-phydev) < 0) {
+phydev_name(phydev), phydev) < 0) {
pr_warn("%s: Can't get IRQ %d (PHY)\n",
phydev->mdio.bus->name, phydev->irq);
phydev->irq = PHY_POLL;
-- 
2.9.3

[PATCH v2 net-next 5/5] arm: vf610: zii devel b: Add support for switch interrupts

2016-10-16 Thread Andrew Lunn

The Switches use GPIO lines to indicate interrupts from two of the
switches.

With these interrupts in place, we can make use of the interrupt
controllers within the switch to indicate when the internal PHYs
generate an interrupt. Use standard PHY properties to do this.

Signed-off-by: Andrew Lunn 
---
 arch/arm/boot/dts/vf610-zii-dev-rev-b.dts | 51 +++
 1 file changed, 51 insertions(+)

diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts 
b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
index 5c1fcab4a6f7..1552db00cc59 100644
--- a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
+++ b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
@@ -88,10 +88,16 @@
 
switch0: switch0@0 {
compatible = "marvell,mv88e6085";
+   pinctrl-0 = <_gpio_switch0>;
+   pinctrl-names = "default";
#address-cells = <1>;
#size-cells = <0>;
reg = <0>;
dsa,member = <0 0>;
+   interrupt-parent = <>;
+   interrupts = <27 IRQ_TYPE_LEVEL_LOW>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
 
ports {
#address-cells = <1>;
@@ -99,16 +105,19 @@
port@0 {
reg = <0>;
label = "lan0";
+   phy-handle = <>;
};
 
port@1 {
reg = <1>;
label = "lan1";
+   phy-handle = <>;
};
 
port@2 {
reg = <2>;
label = "lan2";
+   phy-handle = <>;
};
 
switch0port5: port@5 {
@@ -133,6 +142,24 @@
};
};
};
+   mdio {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   switch0phy0: switch0phy0@0 {
+   reg = <0>;
+   interrupt-parent = <>;
+   interrupts = <0 
IRQ_TYPE_LEVEL_HIGH>;
+   };
+   switch0phy1: switch1phy0@1 {
+   reg = <1>;
+   interrupt-parent = <>;
+   interrupts = <1 
IRQ_TYPE_LEVEL_HIGH>;   };
+   switch0phy2: switch1phy0@2 {
+   reg = <2>;
+   interrupt-parent = <>;
+   interrupts = <2 
IRQ_TYPE_LEVEL_HIGH>;
+   };
+   };
};
};
 
@@ -143,10 +170,16 @@
 
switch1: switch1@0 {
compatible = "marvell,mv88e6085";
+   pinctrl-0 = <_gpio_switch1>;
+   pinctrl-names = "default";
#address-cells = <1>;
#size-cells = <0>;
reg = <0>;
dsa,member = <0 1>;
+   interrupt-parent = <>;
+   interrupts = <26 IRQ_TYPE_LEVEL_LOW>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
 
ports {
#address-cells = <1>;
@@ -196,12 +229,18 @@
#size-cells = <0>;
switch1phy0: switch1phy0@0 {
reg = <0>;
+   interrupt-parent = <>;
+   interrupts = <0 
IRQ_TYPE_LEVEL_HIGH>;
};

[PATCH v2 net-next 3/5] net: phy: Threaded interrupts allow some simplification

2016-10-16 Thread Andrew Lunn

The PHY interrupts are now handled in a threaded interrupt handler,
which can sleep. The work queue is no longer needed, phy_change() can
be called directly. phy_mac_interrupt() still needs to be safe to call
in interrupt context, so keep the work queue, and use a helper to call
phy_change().

Signed-off-by: Andrew Lunn 
---
 drivers/net/phy/phy.c| 45 +---
 drivers/net/phy/phy_device.c |  2 +-
 include/linux/phy.h  |  5 +++--
 3 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 25f2b296aaba..bb673c63c85c 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -664,7 +664,7 @@ static void phy_error(struct phy_device *phydev)
  * @phy_dat: phy_device pointer
  *
  * Description: When a PHY interrupt occurs, the handler disables
- * interrupts, and schedules a work task to clear the interrupt.
+ * interrupts, and uses phy_change to handle the interrupt.
  */
 static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 {
@@ -673,15 +673,10 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
if (PHY_HALTED == phydev->state)
return IRQ_NONE;/* It can't be ours.  */
 
-   /* The MDIO bus is not allowed to be written in interrupt
-* context, so we need to disable the irq here.  A work
-* queue will write the PHY to disable and clear the
-* interrupt, and then reenable the irq line.
-*/
disable_irq_nosync(irq);
atomic_inc(>irq_disable);
 
-   queue_work(system_power_efficient_wq, >phy_queue);
+   phy_change(phydev);
 
return IRQ_HANDLED;
 }
@@ -766,12 +761,6 @@ int phy_stop_interrupts(struct phy_device *phydev)
 
free_irq(phydev->irq, phydev);
 
-   /* Cannot call flush_scheduled_work() here as desired because
-* of rtnl_lock(), but we do not really care about what would
-* be done, except from enable_irq(), so cancel any work
-* possibly pending and take care of the matter below.
-*/
-   cancel_work_sync(>phy_queue);
/* If work indeed has been cancelled, disable_irq() will have
 * been left unbalanced from phy_interrupt() and enable_irq()
 * has to be called so that other devices on the line work.
@@ -784,14 +773,11 @@ int phy_stop_interrupts(struct phy_device *phydev)
 EXPORT_SYMBOL(phy_stop_interrupts);
 
 /**
- * phy_change - Scheduled by the phy_interrupt/timer to handle PHY changes
- * @work: work_struct that describes the work to be done
+ * phy_change - Called by the phy_interrupt to handle PHY changes
+ * @phydev: phy_device struct that interrupted
  */
-void phy_change(struct work_struct *work)
+void phy_change(struct phy_device *phydev)
 {
-   struct phy_device *phydev =
-   container_of(work, struct phy_device, phy_queue);
-
if (phy_interrupt_is_valid(phydev)) {
if (phydev->drv->did_interrupt &&
!phydev->drv->did_interrupt(phydev))
@@ -833,6 +819,18 @@ void phy_change(struct work_struct *work)
 }
 
 /**
+ * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
+ * @work: work_struct that describes the work to be done
+ */
+void phy_change_work(struct work_struct *work)
+{
+   struct phy_device *phydev =
+   container_of(work, struct phy_device, phy_queue);
+
+   phy_change(phydev);
+}
+
+/**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
  */
@@ -1116,6 +1114,15 @@ void phy_state_machine(struct work_struct *work)
   PHY_STATE_TIME * HZ);
 }
 
+/**
+ * phy_mac_interrupt - MAC says the link has changed
+ * @phydev: phy_device struct with changed link
+ * @new_link: Link is Up/Down.
+ *
+ * Description: The MAC layer is able indicate there has been a change
+ *   in the PHY link status. Set the new link status, and trigger the
+ *   state machine, work a work queue.
+ */
 void phy_mac_interrupt(struct phy_device *phydev, int new_link)
 {
phydev->link = new_link;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e977ba931878..ac440a815353 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -347,7 +347,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, 
int addr, int phy_id,
 
mutex_init(>lock);
INIT_DELAYED_WORK(>state_queue, phy_state_machine);
-   INIT_WORK(>phy_queue, phy_change);
+   INIT_WORK(>phy_queue, phy_change_work);
 
/* Request the appropriate module unconditionally; don't
 * bother trying to do so only if it isn't already loaded,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e25f1830fbcf..c47378c93607 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -343,7 +343,7 @@ struct phy_c45_device_ids {
  * giving up on the current attempt at

[RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.

2016-10-16 Thread William Tu

When running bpf_map_lookup on percpu elements, the number of bytes copied
to userspace depends on num_possible_cpus() * value_size, which could
potentially be larger than the memory allocated by the user, which depends
on sysconf(_SC_NPROCESSORS_CONF) to get the current cpu num.  As a
result, the inconsistency might corrupt the user's stack.

The case sysconf(_SC_NPROCESSORS_CONF) != num_possible_cpus()
happens when cpu hotadd is enabled.  For example, in Fusion when
setting vcpu.hotadd = "TRUE" or in KVM, setting
  ./qemu-system-x86_64 -smp 2, maxcpus=4 ...
num_possible_cpus() will be 4 and sysconf() will be 2[1].
Currently any percpu map lookup suffers from this issue; try
samples/bpf/test_maps.c or tracex3.c.

This RFC patch adds an additional 'size' param from userspace so that
the kernel knows the maximum memory it should copy to the user.

[1] https://www.mail-archive.com/netdev@vger.kernel.org/msg121183.html

Signed-off-by: William Tu 
---
 include/uapi/linux/bpf.h   |  5 -
 kernel/bpf/syscall.c   |  5 +++--
 samples/bpf/fds_example.c  |  2 +-
 samples/bpf/libbpf.c   |  3 ++-
 samples/bpf/libbpf.h   |  2 +-
 samples/bpf/test_maps.c| 30 +++---
 tools/include/uapi/linux/bpf.h |  5 -
 7 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b..fa0c40b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -123,7 +123,10 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
-   __u64   flags;
+   union {
+   __u64   flags;
+   __u32   size; /* number of bytes allocated in 
userspace */
+   };
};
 
struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..be211ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -264,13 +264,14 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void 
*key, void *value)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD size
 
 static int map_lookup_elem(union bpf_attr *attr)
 {
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
+   u32 usize = attr->size;
struct bpf_map *map;
void *key, *value, *ptr;
u32 value_size;
@@ -324,7 +325,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_value;
 
err = -EFAULT;
-   if (copy_to_user(uvalue, value, value_size) != 0)
+   if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
goto free_value;
 
err = 0;
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index 625e797..5b833d8 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -88,7 +88,7 @@ static int bpf_do_map(const char *file, uint32_t flags, 
uint32_t key,
   ret, strerror(errno));
assert(ret == 0);
} else if (flags & BPF_F_KEY) {
-   ret = bpf_lookup_elem(fd, , );
+   ret = bpf_lookup_elem(fd, , , sizeof(value));
printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value,
   ret, strerror(errno));
assert(ret == 0);
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 9969e35..9f0a1c3 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -44,12 +44,13 @@ int bpf_update_elem(int fd, void *key, void *value, 
unsigned long long flags)
return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, , sizeof(attr));
 }
 
-int bpf_lookup_elem(int fd, void *key, void *value)
+int bpf_lookup_elem(int fd, void *key, void *value, int size)
 {
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
+   .size = size,
};
 
return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, , sizeof(attr));
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb6..b911185 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -7,7 +7,7 @@ struct bpf_insn;
 int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
   int max_entries, int map_flags);
 int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
-int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_lookup_elem(int fd, void *key, void *value, int size);
 int bpf_delete_elem(int fd, void *key);
 int bpf_get_next_key(int fd, void *key, void *next_key);
 
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
index cce2b59..a6a8fbe 100644
---

net/l2tp:BUG: KASAN: use-after-free in l2tp_ip6_close

2016-10-16 Thread Baozeng Ding

Hello,
While running syzkaller fuzzer I have got the following use-after-free
bug in l2tp_ip6_close. The kernel version is 4.8.0+ (on Oct 7 commit 
d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0).

BUG: KASAN: use-after-free in l2tp_ip6_close+0x22e/0x290 at addr 
8800081b0ed8
Write of size 8 by task syz-executor/10987
CPU: 0 PID: 10987 Comm: syz-executor Not tainted 4.8.0+ #39
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 880031d97838 829f835b 88001b5a1640 8800081b0ec0
 8800081b15a0 8800081b6d20 880031d97860 8174d3cc
 880031d978f0 8800081b0e80 88001b5a1640 880031d978e0
Call Trace:
 [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
 [< inline >] print_address_description mm/kasan/report.c:194
 [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283
 [< inline >] kasan_report mm/kasan/report.c:303
 [] __asan_report_store8_noabort+0x3e/0x40 
mm/kasan/report.c:329
 [< inline >] __write_once_size ./include/linux/compiler.h:249
 [< inline >] __hlist_del ./include/linux/list.h:622
 [< inline >] hlist_del_init ./include/linux/list.h:637
 [] l2tp_ip6_close+0x22e/0x290 net/l2tp/l2tp_ip6.c:239
 [] inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415
 [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422
 [] sock_release+0x8d/0x1d0 net/socket.c:570
 [] sock_close+0x16/0x20 net/socket.c:1017
 [] __fput+0x28c/0x780 fs/file_table.c:208
 [] fput+0x15/0x20 fs/file_table.c:244
 [] task_work_run+0xf9/0x170
 [] do_exit+0x85e/0x2a00
 [] do_group_exit+0x108/0x330
 [] get_signal+0x617/0x17a0 kernel/signal.c:2307
 [] do_signal+0x7f/0x18f0
 [] exit_to_usermode_loop+0xbf/0x150 
arch/x86/entry/common.c:156
 [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190
 [] syscall_return_slowpath+0x1a0/0x1e0 
arch/x86/entry/common.c:259
 [] entry_SYSCALL_64_fastpath+0xc4/0xc6
Object at 8800081b0ec0, in cache L2TP/IPv6 size: 1448
Allocated:
PID = 10987
 [ 1116.897025] [] save_stack_trace+0x16/0x20
 [ 1116.897025] [] save_stack+0x46/0xd0
 [ 1116.897025] [] kasan_kmalloc+0xad/0xe0
 [ 1116.897025] [] kasan_slab_alloc+0x12/0x20
 [ 1116.897025] [< inline >] slab_post_alloc_hook mm/slab.h:417
 [ 1116.897025] [< inline >] slab_alloc_node mm/slub.c:2708
 [ 1116.897025] [< inline >] slab_alloc mm/slub.c:2716
 [ 1116.897025] [] kmem_cache_alloc+0xc8/0x2b0 mm/slub.c:2721
 [ 1116.897025] [] sk_prot_alloc+0x69/0x2b0 
net/core/sock.c:1326
 [ 1116.897025] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388
 [ 1116.897025] [] inet6_create+0x2d7/0x1000 
net/ipv6/af_inet6.c:182
 [ 1116.897025] [] __sock_create+0x37b/0x640 net/socket.c:1153
 [ 1116.897025] [< inline >] sock_create net/socket.c:1193
 [ 1116.897025] [< inline >] SYSC_socket net/socket.c:1223
 [ 1116.897025] [] SyS_socket+0xef/0x1b0 net/socket.c:1203
 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0x23/0xc6
Freed:
PID = 10987
 [ 1116.897025] [] save_stack_trace+0x16/0x20
 [ 1116.897025] [] save_stack+0x46/0xd0
 [ 1116.897025] [] kasan_slab_free+0x71/0xb0
 [ 1116.897025] [< inline >] slab_free_hook mm/slub.c:1352
 [ 1116.897025] [< inline >] slab_free_freelist_hook mm/slub.c:1374
 [ 1116.897025] [< inline >] slab_free mm/slub.c:2951
 [ 1116.897025] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973
 [ 1116.897025] [< inline >] sk_prot_free net/core/sock.c:1369
 [ 1116.897025] [] __sk_destruct+0x32b/0x4f0 
net/core/sock.c:1444
 [ 1116.897025] [] sk_destruct+0x44/0x80 net/core/sock.c:1452
 [ 1116.897025] [] __sk_free+0x53/0x220 net/core/sock.c:1460
 [ 1116.897025] [] sk_free+0x23/0x30 net/core/sock.c:1471
 [ 1116.897025] [] sk_common_release+0x28c/0x3e0 
./include/net/sock.h:1589
 [ 1116.897025] [] l2tp_ip6_close+0x1fe/0x290 
net/l2tp/l2tp_ip6.c:243
 [ 1116.897025] [] inet_release+0xed/0x1c0 
net/ipv4/af_inet.c:415
 [ 1116.897025] [] inet6_release+0x50/0x70 
net/ipv6/af_inet6.c:422
 [ 1116.897025] [] sock_release+0x8d/0x1d0 net/socket.c:570
 [ 1116.897025] [] sock_close+0x16/0x20 net/socket.c:1017
 [ 1116.897025] [] __fput+0x28c/0x780 fs/file_table.c:208
 [ 1116.897025] [] fput+0x15/0x20 fs/file_table.c:244
 [ 1116.897025] [] task_work_run+0xf9/0x170
 [ 1116.897025] [] do_exit+0x85e/0x2a00
 [ 1116.897025] [] do_group_exit+0x108/0x330
 [ 1116.897025] [] get_signal+0x617/0x17a0 
kernel/signal.c:2307
 [ 1116.897025] [] do_signal+0x7f/0x18f0
 [ 1116.897025] [] exit_to_usermode_loop+0xbf/0x150 
arch/x86/entry/common.c:156
 [ 1116.897025] [< inline >] prepare_exit_to_usermode 
arch/x86/entry/common.c:190
 [ 1116.897025] [] syscall_return_slowpath+0x1a0/0x1e0 
arch/x86/entry/common.c:259
 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0xc4/0xc6
Memory state around the buggy address:
 8800081b0d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 8800081b0e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc

Re: [RFC v2 0/2] proc connector: get namespace events

2016-10-16 Thread Eric W. Biederman

Alban Crequy  writes:

> This is v2 of the patch set to add namespace events in the proc
> connector.

So while not totally wrong the way you report namespaces makes me
grumpy.  You are not including the device node of the filesystem
those inodes are on.  The inode number is meaningless if you don't
specify which filesystem the inode is from. 

I absolutely do not want to have to implement a namespace for namespaces
someday just because people have been sloppy like this.

So please correct this to at least report the full information on
namespaces.

Thank you,
Eric

BUG: KASAN: use-after-free in udp_lib_rehash

2016-10-16 Thread Baozeng Ding

Hello all,
While running syzkaller fuzzer I have got the following use-after-free
bug in udp_lib_rehash. The kernel version is 4.8.0+ (on Oct 7 commit 
d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0). Unfortunately I failed to find a 
reproducer for it.

BUG: KASAN: use-after-free in udp_lib_rehash+0x634/0x640 at addr 
88002f3fe1e0
Write of size 8 by task syz-executor/11156
CPU: 3 PID: 11156 Comm: syz-executor Not tainted 4.8.0+ #39
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 88001acb7b58 829f835b 880034acd900 88002f3fe1c0
 88002f3fe8d0 c9230810 88001acb7b80 8174d3cc
 88001acb7c10 88002f3fe180 880034acd900 88001acb7c00
Call Trace:
 [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
 [< inline >] print_address_description mm/kasan/report.c:194
 [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283
 [< inline >] kasan_report mm/kasan/report.c:303
 [] __asan_report_store8_noabort+0x3e/0x40 
mm/kasan/report.c:329
 [< inline >] hlist_add_head_rcu ./include/linux/rculist.h:487
 [] udp_lib_rehash+0x634/0x640 net/ipv4/udp.c:1429
 [] udp_v6_rehash+0x72/0xa0 net/ipv6/udp.c:115
 [] ip6_datagram_connect+0x786/0xc40
 [] inet_dgram_connect+0x112/0x1f0 net/ipv4/af_inet.c:530
 [] SYSC_connect+0x23e/0x2e0 net/socket.c:1533
 [] SyS_connect+0x24/0x30 net/socket.c:1514
 [] entry_SYSCALL_64_fastpath+0x23/0xc6
Object at 88002f3fe1c0, in cache UDPv6 size: 1496
Allocated:
PID = 11149
 [ 1921.980207] [] save_stack_trace+0x16/0x20
 [ 1921.980207] [] save_stack+0x46/0xd0
 [ 1921.980207] [] kasan_kmalloc+0xad/0xe0
 [ 1921.980207] [] kasan_slab_alloc+0x12/0x20
 [ 1921.980207] [< inline >] slab_post_alloc_hook mm/slab.h:417
 [ 1921.980207] [< inline >] slab_alloc_node mm/slub.c:2708
 [ 1921.980207] [< inline >] slab_alloc mm/slub.c:2716
 [ 1921.980207] [] kmem_cache_alloc+0xc8/0x2b0 mm/slub.c:2721
 [ 1921.980207] [] sk_prot_alloc+0x69/0x2b0 
net/core/sock.c:1326
 [ 1921.980207] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388
 [ 1921.980207] [] inet6_create+0x2d7/0x1000 
net/ipv6/af_inet6.c:182
 [ 1921.980207] [] __sock_create+0x37b/0x640 net/socket.c:1153
 [ 1921.980207] [< inline >] sock_create net/socket.c:1193
 [ 1921.980207] [< inline >] SYSC_socket net/socket.c:1223
 [ 1921.980207] [] SyS_socket+0xef/0x1b0 net/socket.c:1203
 [ 1921.980207] [] entry_SYSCALL_64_fastpath+0x23/0xc6
Freed:
PID = 11157
 [ 1921.980207] [] save_stack_trace+0x16/0x20
 [ 1921.980207] [] save_stack+0x46/0xd0
 [ 1921.980207] [] kasan_slab_free+0x71/0xb0
 [ 1921.980207] [< inline >] slab_free_hook mm/slub.c:1352
 [ 1921.980207] [< inline >] slab_free_freelist_hook mm/slub.c:1374
 [ 1921.980207] [< inline >] slab_free mm/slub.c:2951
 [ 1921.980207] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973
 [ 1921.980207] [< inline >] sk_prot_free net/core/sock.c:1369
 [ 1921.980207] [] __sk_destruct+0x32b/0x4f0 
net/core/sock.c:1444
 [ 1921.980207] [] sk_destruct+0x44/0x80 net/core/sock.c:1452
 [ 1921.980207] [] __sk_free+0x53/0x220 net/core/sock.c:1460
 [ 1921.980207] [] sk_free+0x23/0x30 net/core/sock.c:1471
 [ 1921.980207] [] sk_common_release+0x28c/0x3e0 
./include/net/sock.h:1589
 [ 1921.980207] [] udp_lib_close+0x15/0x20 
./include/net/udp.h:203
 [ 1921.980207] [] inet_release+0xed/0x1c0 
net/ipv4/af_inet.c:415
 [ 1921.980207] [] inet6_release+0x50/0x70 
net/ipv6/af_inet6.c:422
 [ 1921.980207] [] sock_release+0x8d/0x1d0 net/socket.c:570
 [ 1921.980207] [] sock_close+0x16/0x20 net/socket.c:1017
 [ 1921.980207] [] __fput+0x28c/0x780 fs/file_table.c:208
 [ 1921.980207] [] fput+0x15/0x20 fs/file_table.c:244
 [ 1921.980207] [] task_work_run+0xf9/0x170
 [ 1921.980207] [] do_exit+0x85e/0x2a00
 [ 1921.980207] [] do_group_exit+0x108/0x330
 [ 1921.980207] [] get_signal+0x617/0x17a0 
kernel/signal.c:2307
 [ 1921.980207] [] do_signal+0x7f/0x18f0
 [ 1921.980207] [] exit_to_usermode_loop+0xbf/0x150 
arch/x86/entry/common.c:156
 [ 1921.980207] [< inline >] prepare_exit_to_usermode 
arch/x86/entry/common.c:190
 [ 1921.980207] [] syscall_return_slowpath+0x1a0/0x1e0 
arch/x86/entry/common.c:259
 [ 1921.980207] [] entry_SYSCALL_64_fastpath+0xc4/0xc6
Memory state around the buggy address:
 88002f3fe080: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 88002f3fe100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>88002f3fe180: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
   ^
 88002f3fe200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 88002f3fe280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Thanks && Best Regards,
Baozeng Ding

BUG: KASAN: use-after-free in udp_lib_get_port

2016-10-16 Thread Baozeng Ding

Hello all,
While running syzkaller fuzzer I have got the following use-after-free
bug in udp_lib_get_port. The kernel version is 4.8.0+ (on Oct 7 commit 
d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0). Unfortunately I failed to find a 
reproducer for it.

BUG: KASAN: use-after-free in udp_lib_get_port+0x1573/0x1860 at addr 
88000804cb60
Write of size 8 by task syz-executor/31190
CPU: 0 PID: 31190 Comm: syz-executor Not tainted 4.8.0+ #39
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 880015ac7a48 829f835b 880032b531c0 88000804cb40
 88000804d250 880017415a4a 880015ac7a70 8174d3cc
 880015ac7b00 88000804cb00 880032b531c0 880015ac7af0
Call Trace:
 [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
 [< inline >] print_address_description mm/kasan/report.c:194
 [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283
 [< inline >] kasan_report mm/kasan/report.c:303
 [] __asan_report_store8_noabort+0x3e/0x40 
mm/kasan/report.c:329
 [< inline >] hlist_add_head_rcu ./include/linux/rculist.h:487
 [] udp_lib_get_port+0x1573/0x1860 net/ipv4/udp.c:345
 [] udp_v6_get_port+0xa7/0xd0 net/ipv6/udp.c:106
 [] inet6_bind+0x89c/0xfb0 net/ipv6/af_inet6.c:384
 [] SYSC_bind+0x1ea/0x250 net/socket.c:1367
 [] SyS_bind+0x24/0x30 net/socket.c:1353
 [] entry_SYSCALL_64_fastpath+0x23/0xc6
Object at 88000804cb40, in cache UDPv6 size: 1496
Allocated:
PID = 30789
 [  378.305168] [] save_stack_trace+0x16/0x20
 [  378.305168] [] save_stack+0x46/0xd0
 [  378.305168] [] kasan_kmalloc+0xad/0xe0
 [  378.305168] [] kasan_slab_alloc+0x12/0x20
 [  378.305168] [< inline >] slab_post_alloc_hook mm/slab.h:417
 [  378.305168] [< inline >] slab_alloc_node mm/slub.c:2708
 [  378.305168] [< inline >] slab_alloc mm/slub.c:2716
 [  378.305168] [] kmem_cache_alloc+0xc8/0x2b0 mm/slub.c:2721
 [  378.305168] [] sk_prot_alloc+0x69/0x2b0 
net/core/sock.c:1326
 [  378.305168] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388
 [  378.305168] [] inet6_create+0x2d7/0x1000 
net/ipv6/af_inet6.c:182
 [  378.305168] [] __sock_create+0x37b/0x640 net/socket.c:1153
 [  378.305168] [< inline >] sock_create net/socket.c:1193
 [  378.305168] [< inline >] SYSC_socket net/socket.c:1223
 [  378.305168] [] SyS_socket+0xef/0x1b0 net/socket.c:1203
 [  378.305168] [] entry_SYSCALL_64_fastpath+0x23/0xc6
Freed:
PID = 30789
 [  378.305168] [] save_stack_trace+0x16/0x20
 [  378.305168] [] save_stack+0x46/0xd0
 [  378.305168] [] kasan_slab_free+0x71/0xb0
 [  378.305168] [< inline >] slab_free_hook mm/slub.c:1352
 [  378.305168] [< inline >] slab_free_freelist_hook mm/slub.c:1374
 [  378.305168] [< inline >] slab_free mm/slub.c:2951
 [  378.305168] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973
 [  378.305168] [< inline >] sk_prot_free net/core/sock.c:1369
 [  378.305168] [] __sk_destruct+0x32b/0x4f0 
net/core/sock.c:1444
 [  378.305168] [] sk_destruct+0x44/0x80 net/core/sock.c:1452
 [  378.305168] [] __sk_free+0x53/0x220 net/core/sock.c:1460
 [  378.305168] [] sk_free+0x23/0x30 net/core/sock.c:1471
 [  378.305168] [] sk_common_release+0x28c/0x3e0 
./include/net/sock.h:1589
 [  378.305168] [] udp_lib_close+0x15/0x20 
./include/net/udp.h:203
 [  378.305168] [] inet_release+0xed/0x1c0 
net/ipv4/af_inet.c:415
 [  378.305168] [] inet6_release+0x50/0x70 
net/ipv6/af_inet6.c:422
 [  378.305168] [] sock_release+0x8d/0x1d0 net/socket.c:570
 [  378.305168] [] sock_close+0x16/0x20 net/socket.c:1017
 [  378.305168] [] __fput+0x28c/0x780 fs/file_table.c:208
 [  378.305168] [] fput+0x15/0x20 fs/file_table.c:244
 [  378.305168] [] task_work_run+0xf9/0x170
 [  378.305168] [] do_exit+0x85e/0x2a00
 [  378.305168] [] do_group_exit+0x108/0x330
 [  378.376437] [] get_signal+0x617/0x17a0 
kernel/signal.c:2307
 [  378.376437] [] do_signal+0x7f/0x18f0
 [  378.376437] [] exit_to_usermode_loop+0xbf/0x150 
arch/x86/entry/common.c:156
 [  378.376437] [< inline >] prepare_exit_to_usermode 
arch/x86/entry/common.c:190
 [  378.376437] [] syscall_return_slowpath+0x1a0/0x1e0 
arch/x86/entry/common.c:259
 [  378.376437] [] entry_SYSCALL_64_fastpath+0xc4/0xc6
Memory state around the buggy address:
 88000804ca00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 88000804ca80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>88000804cb00: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
   ^
 88000804cb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 88000804cc00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==

Thanks && Best Regards,
Baozeng Ding

net/ipv6: potential deadlock in do_ipv6_setsockopt

2016-10-16 Thread Baozeng Ding

Hello,
While running syzkaller fuzzer I have got the following deadlock
report. The kernel version is 4.8.0+ (on Oct 7 commit 
d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0). Unfortunately I failed to find a 
reproducer for it. 
===
[ INFO: possible circular locking dependency detected ]
4.8.0+ #39 Not tainted
---
syz-executor/21301 is trying to acquire lock:
 ([  165.136033] rtnl_mutex
[] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:70

but task is already holding lock:
 ([  165.136033] sk_lock-AF_INET6
[] do_ipv6_setsockopt.isra.7+0x1f1/0x2960

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

:
   [  165.136033] [] lock_acquire+0x1a8/0x380 
kernel/locking/lockdep.c:3746
   [  165.136033] [] lock_sock_nested+0xcb/0x120 
net/core/sock.c:2493
   [  165.136033] [] 
do_ipv6_setsockopt.isra.7+0x268/0x2960
   [  165.136033] [] ipv6_setsockopt+0x9b/0x140
   [  165.136033] [] udpv6_setsockopt+0x45/0x80 
net/ipv6/udp.c:1344
   [  165.136033] [] sock_common_setsockopt+0x95/0xd0 
net/core/sock.c:2688
   [  165.136033] [< inline >] SYSC_setsockopt net/socket.c:1742
   [  165.136033] [] SyS_setsockopt+0x158/0x240 
net/socket.c:1721
   [  165.136033] [] entry_SYSCALL_64_fastpath+0x23/0xc6

:
   [  165.136033] [< inline >] check_prev_add 
kernel/locking/lockdep.c:1829
   [  165.136033] [< inline >] check_prevs_add 
kernel/locking/lockdep.c:1939
   [  165.136033] [< inline >] validate_chain 
kernel/locking/lockdep.c:2266
   [  165.136033] [] __lock_acquire+0x35a9/0x4bc0 
kernel/locking/lockdep.c:3335
   [  165.136033] [] lock_acquire+0x1a8/0x380 
kernel/locking/lockdep.c:3746
   [  165.136033] [< inline >] __mutex_lock_common 
kernel/locking/mutex.c:521
   [  165.136033] [] mutex_lock_nested+0xb1/0x860 
kernel/locking/mutex.c:621
   [  165.136033] [] rtnl_lock+0x17/0x20 
net/core/rtnetlink.c:70
   [  165.136033] [] ipv6_sock_mc_close+0xfe/0x350 
net/ipv6/mcast.c:288
   [  165.136033] [] 
do_ipv6_setsockopt.isra.7+0x22fc/0x2960
   [  165.136033] [] ipv6_setsockopt+0x9b/0x140
   [  165.136033] [] udpv6_setsockopt+0x45/0x80 
net/ipv6/udp.c:1344
   [  165.136033] [] sock_common_setsockopt+0x95/0xd0 
net/core/sock.c:2688
   [  165.136033] [< inline >] SYSC_setsockopt net/socket.c:1742
   [  165.136033] [] SyS_setsockopt+0x158/0x240 
net/socket.c:1721
   [  165.136033] [] entry_SYSCALL_64_fastpath+0x23/0xc6
other info that might help us debug this:

 Possible unsafe locking scenario:

   CPU0CPU1
   
  lock([  165.136033] sk_lock-AF_INET6
);
   lock([  165.136033] rtnl_mutex
);
   lock([  165.136033] sk_lock-AF_INET6
);
  lock([  165.136033] rtnl_mutex
);

 *** DEADLOCK ***

1 lock held by syz-executor/21301:
 #0: [  165.136033]  (

stack backtrace:
CPU: 1 PID: 21301 Comm: syz-executor Not tainted 4.8.0+ #39
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 880017217580 829f835b 88d65790 88d65790
 88dc6b70 880016f41fd8 8800172175d0 8141df18
 880016f41ffa dc00 8764c180 880016f41fd8
Call Trace:
 [] dump_stack+0xb3/0x118 lib/dump_stack.c:15
 [] print_circular_bug+0x288/0x340 
kernel/locking/lockdep.c:1202
 [< inline >] check_prev_add kernel/locking/lockdep.c:1829
 [< inline >] check_prevs_add kernel/locking/lockdep.c:1939
 [< inline >] validate_chain kernel/locking/lockdep.c:2266
 [] __lock_acquire+0x35a9/0x4bc0 kernel/locking/lockdep.c:3335
 [] lock_acquire+0x1a8/0x380 kernel/locking/lockdep.c:3746
 [< inline >] __mutex_lock_common kernel/locking/mutex.c:521
 [] mutex_lock_nested+0xb1/0x860 kernel/locking/mutex.c:621
 [] rtnl_lock+0x17/0x20 net/core/rtnetlink.c:70
 [] ipv6_sock_mc_close+0xfe/0x350 net/ipv6/mcast.c:288
 [] do_ipv6_setsockopt.isra.7+0x22fc/0x2960
 [] ipv6_setsockopt+0x9b/0x140
 [] udpv6_setsockopt+0x45/0x80 net/ipv6/udp.c:1344
 [] sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2688
 [< inline >] SYSC_setsockopt net/socket.c:1742
 [] SyS_setsockopt+0x158/0x240 net/socket.c:1721
 [] entry_SYSCALL_64_fastpath+0x23/0xc6

Thanks && Best Regards,
Baozeng Ding

Re: [patch net-next RFC 4/6] Introduce sample tc action

2016-10-16 Thread Or Gerlitz

On Wed, Oct 12, 2016 at 3:41 PM, Jiri Pirko  wrote:
> From: Yotam Gigi 
>
> This action allow the user to sample traffic matched by tc classifier.
> The sampling consists of choosing packets randomly, truncating them,
> adding some informative metadata regarding the interface and the original
> packet size and mark them with specific mark, to allow further tc rules to
> match and process. The marked sample packets are then injected into the
> device ingress qdisc using netif_receive_skb.
>
> The packets metadata is packed using the ife encapsulation protocol, and
> the outer packet's ethernet dest, source and eth_type, along with the
> rate, mark and the optional truncation size can be configured from
> userspace.
>
> Example:
> To sample ingress traffic from interface eth1, and redirect the sampled
> the sampled packets to interface dummy0, one may use the commands:
>
> tc qdisc add dev eth1 handle : ingress
>
> tc filter add dev eth1 parent : \
>matchall action sample rate 12 mark 17
>
> tc filter add parent : dev eth1 protocol all \
>u32 match mark 172 0xff
>action mirred egress redirect dev dummy0
>
> Where the first command adds an ingress qdisc and the second starts
> sampling every 12'th packet on dev eth0 and marks the sampled packets with
> 17. The command third catches the sampled packets, which are marked with
> 17, and redirects them to dev dummy0.

eth0 --> eth1

command third --> third command

don't we need a re-classify directive for the u32 filter to apply
after the marking done by the matchall rule + sample action
or is that implicit?


> diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
> new file mode 100644
> index 000..a2b445a
> --- /dev/null
> +++ b/include/net/tc_act/tc_sample.h
> @@ -0,0 +1,88 @@
> +#ifndef __NET_TC_SAMPLE_H
> +#define __NET_TC_SAMPLE_H
> +
> +#include 
> +#include 
> +
> +struct tcf_sample {
> +   struct tc_actioncommon;
> +   u32 rate;
> +   u32 mark;
> +   booltruncate;
> +   u32 trunc_size;
> +   u32 packet_counter;
> +   u8  eth_dst[ETH_ALEN];
> +   u8  eth_src[ETH_ALEN];
> +   u16 eth_type;
> +   booleth_type_set;
> +   struct list_headtcfm_list;
> +};

> +++ b/include/uapi/linux/tc_act/tc_sample.h
> @@ -0,0 +1,31 @@
> +#ifndef __LINUX_TC_SAMPLE_H
> +#define __LINUX_TC_SAMPLE_H
> +
> +#include 
> +#include 
> +#include 
> +
> +#define TCA_ACT_SAMPLE 26
> +
> +struct tc_sample {
> +   tc_gen;
> +   __u32   rate;   /* sample rate */
> +   __u32   mark;   /* mark to put on the sampled packets 
> */
> +   booltruncate;   /* whether to truncate the packets */
> +   __u32   trunc_size; /* truncation size */
> +   __u8eth_dst[ETH_ALEN]; /* encapsulated mac destination */
> +   __u8eth_src[ETH_ALEN]; /* encapsulated mac source */
> +   booleth_type_set;  /* whether to overrid ethtype */
> +   __u16   eth_type;  /* encapsulated mac ethtype */
> +};

overrid --> override

what do you mean by override here, to encapsulate?

consider using 0 as a special value, e.g. no truncation and no encapsulation

It would be best if you just define the netlink attributes (document the
type on the RHS, see the uapi
for the new tunnel key action) and let the tc action in-kernel code
decode them directly
into the non-UAPI structure. This way you are extensible and also
avoid having two
structs, which is sort of confusing.

> +
> +enum {
> +   TCA_SAMPLE_UNSPEC,
> +   TCA_SAMPLE_TM,
> +   TCA_SAMPLE_PARMS,
> +   TCA_SAMPLE_PAD,
> +   __TCA_SAMPLE_MAX
> +};
> +#define TCA_SAMPLE_MAX (__TCA_SAMPLE_MAX - 1)
> +
> +#endif


> +static bool dev_ok_push(struct net_device *dev)
> +{
> +   switch (dev->type) {
> +   case ARPHRD_TUNNEL:
> +   case ARPHRD_TUNNEL6:
> +   case ARPHRD_SIT:
> +   case ARPHRD_IPGRE:
> +   case ARPHRD_VOID:
> +   case ARPHRD_NONE:
> +   return false;
> +   default:
> +   return true;
> +   }
> +}
> +

> +static int tcf_sample(struct sk_buff *skb, const struct tc_action *a,
> + struct tcf_result *res)
> +{
> +   struct tcf_sample *s = to_sample(a);
> +   struct sample_packet_metadata metadata;
> +   static struct ethhdr *ethhdr;
> +   struct sk_buff *skb2;
> +   int retval;
> +   u32 at;
> +
> +   tcf_lastuse_update(>tcf_tm);
> +   bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
> +
> +   rcu_read_lock();
> +   retval = READ_ONCE(s->tcf_action);
> +
> +   if (++s->packet_counter % s->rate == 0) {
> +   skb2 =

Re: [PATCH v2 1/3] net: smc91x: isolate u16 writes alignment workaround

2016-10-16 Thread Robert Jarzmik

Robert Jarzmik  writes:

> diff --git a/drivers/net/ethernet/smsc/smc91x.h 
> b/drivers/net/ethernet/smsc/smc91x.h
> index ea8465467469..dff165ed106d 100644
> --- a/drivers/net/ethernet/smsc/smc91x.h
> +++ b/drivers/net/ethernet/smsc/smc91x.h

And there is also the specific case of ARCH=MN10300, where I didn't see :
#include 

In which SMC_outw() is also defined ... sic.

So this is also to be fixed, thanks to kbuild test robot.

--
Robert

39 matches

Mail list logo