date:20170330

[PATCH v2 net-next 5/6] selftests/bpf: add a test for basic XDP functionality

2017-03-30 Thread Alexei Starovoitov

add C test for xdp_adjust_head(), packet rewrite and map lookups

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile   |   2 +-
 tools/testing/selftests/bpf/test_iptunnel_common.h |  37 
 tools/testing/selftests/bpf/test_progs.c   |  58 +
 tools/testing/selftests/bpf/test_xdp.c | 236 +
 4 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_iptunnel_common.h
 create mode 100644 tools/testing/selftests/bpf/test_xdp.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index ff68c9419a67..76cbe1d42dda 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -6,7 +6,7 @@ LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
 
-TEST_GEN_FILES = test_pkt_access.o
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o
 
 TEST_PROGS := test_kmod.sh
 
diff --git a/tools/testing/selftests/bpf/test_iptunnel_common.h 
b/tools/testing/selftests/bpf/test_iptunnel_common.h
new file mode 100644
index ..e4cd252a1b20
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_iptunnel_common.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _TEST_IPTNL_COMMON_H
+#define _TEST_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+struct vip {
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } daddr;
+   __u16 dport;
+   __u16 family;
+   __u8 protocol;
+};
+
+struct iptnl_info {
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } saddr;
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } daddr;
+   __u16 family;
+   __u8 dmac[6];
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 1d9a310e71e5..defcb273242e 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -27,6 +27,7 @@ typedef __u16 __sum16;
 #include 
 #include 
 #include 
+#include "test_iptunnel_common.h"
 
 #define _htons __builtin_bswap16
 
@@ -100,6 +101,20 @@ static int bpf_prog_load(const char *file, enum 
bpf_prog_type type,
return 0;
 }
 
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+   const char *name)
+{
+   struct bpf_map *map;
+
+   map = bpf_object__find_map_by_name(obj, name);
+   if (!map) {
+   printf("%s:FAIL:map '%s' not found\n", test, name);
+   error_cnt++;
+   return -1;
+   }
+   return bpf_map__fd(map);
+}
+
 static void test_pkt_access(void)
 {
const char *file = "./test_pkt_access.o";
@@ -125,6 +140,48 @@ static void test_pkt_access(void)
bpf_object__close(obj);
 }
 
+static void test_xdp(void)
+{
+   struct vip key4 = {.protocol = 6, .family = AF_INET};
+   struct vip key6 = {.protocol = 6, .family = AF_INET6};
+   struct iptnl_info value4 = {.family = AF_INET};
+   struct iptnl_info value6 = {.family = AF_INET6};
+   const char *file = "./test_xdp.o";
+   struct bpf_object *obj;
+   char buf[128];
+   struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr);
+   struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+   __u32 duration, retval, size;
+   int err, prog_fd, map_fd;
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+   if (err)
+   return;
+
+   map_fd = bpf_find_map(__func__, obj, "vip2tnl");
+   if (map_fd < 0)
+   goto out;
+   bpf_map_update_elem(map_fd, &key4, &value4, 0);
+   bpf_map_update_elem(map_fd, &key6, &value6, 0);
+
+   err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+   buf, &size, &retval, &duration);
+
+   CHECK(err || errno || retval != XDP_TX || size != 74 ||
+ iph->protocol != IPPROTO_IPIP, "ipv4",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+
+   err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
+   buf, &size, &retval, &duration);
+   CHECK(err || errno || retval != XDP_TX || size != 114 ||
+ iph6->nexthdr != IPPROTO_IPV6, "ipv6",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+out:
+   bpf_object__close(obj);
+}
+
 int main(void)
 {
struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -132,6 +189,7 @@ int main(void)
setrlimit(RLIMIT_MEMLOCK, &rinf);
 
test_pkt_access();
+   test_xdp();

[PATCH v2 net-next 0/6] bpf: program testing framework

2017-03-30 Thread Alexei Starovoitov

Development and testing of networking bpf programs is quite cumbersome.
Especially tricky are XDP programs that attach to real netdevices and
program development feels like working on the car engine while
the car is in motion.
Another problem is ongoing changes to upstream llvm core
that can introduce an optimization that verifier will not
recognize. llvm bpf backend tests have no ability to run the programs.
To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.
It achieves several goals:
- development of xdp and skb based bpf programs can be done
in a canned environment with unit tests
- program performance optimizations can be benchmarked outside of
networking core (without driver and skb costs)
- continuous testing of upstream changes is finally practical

Patches 4,5,6 add C based test cases of various complexity
to cover some sched_cls and xdp features. More tests will
be added in the future. The tests were run on centos7 only.

For now the framework supports only skb and xdp programs. In the future
it can be extended to socket_filter and tracing program types.

More details are in individual patches.

v1->v2:
- rename bpf_program_test_run->bpf_prog_test_run
- add missing #include <linux/bpf.h> since libbpf.h shouldn't depend
on prior includes
- reordered patches 3 and 4 to keep bisect clean

Alexei Starovoitov (6):
  bpf: introduce BPF_PROG_TEST_RUN command
  tools/lib/bpf: add support for BPF_PROG_TEST_RUN command
  tools/lib/bpf: expose bpf_program__set_type()
  selftests/bpf: add a test for overlapping packet range checks
  selftests/bpf: add a test for basic XDP functionality
  selftests/bpf: add l4 load balancer test based on sched_cls

 include/linux/bpf.h|   7 +
 include/uapi/linux/bpf.h   |  12 +
 kernel/bpf/syscall.c   |  27 +-
 net/Makefile   |   2 +-
 net/bpf/Makefile   |   1 +
 net/bpf/test_run.c | 172 
 net/core/filter.c  |   5 +
 tools/include/uapi/linux/bpf.h |  24 ++
 tools/lib/bpf/bpf.c|  24 ++
 tools/lib/bpf/bpf.h|   4 +-
 tools/lib/bpf/libbpf.c |   3 +-
 tools/lib/bpf/libbpf.h |   2 +
 tools/testing/selftests/bpf/Makefile   |  17 +-
 tools/testing/selftests/bpf/test_iptunnel_common.h |  37 ++
 tools/testing/selftests/bpf/test_l4lb.c| 474 +
 tools/testing/selftests/bpf/test_pkt_access.c  |  64 +++
 tools/testing/selftests/bpf/test_progs.c   | 284 
 tools/testing/selftests/bpf/test_xdp.c | 236 ++
 18 files changed, 1385 insertions(+), 10 deletions(-)
 create mode 100644 net/bpf/Makefile
 create mode 100644 net/bpf/test_run.c
 create mode 100644 tools/testing/selftests/bpf/test_iptunnel_common.h
 create mode 100644 tools/testing/selftests/bpf/test_l4lb.c
 create mode 100644 tools/testing/selftests/bpf/test_pkt_access.c
 create mode 100644 tools/testing/selftests/bpf/test_progs.c
 create mode 100644 tools/testing/selftests/bpf/test_xdp.c

-- 
2.9.3

[PATCH v2 net-next 4/6] selftests/bpf: add a test for overlapping packet range checks

2017-03-30 Thread Alexei Starovoitov

add simple C test case for llvm and verifier range check fix from
commit b1977682a385 ("bpf: improve verifier packet range checks")

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile  |  17 +++-
 tools/testing/selftests/bpf/test_pkt_access.c |  64 
 tools/testing/selftests/bpf/test_progs.c  | 138 ++
 3 files changed, 215 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_pkt_access.c
 create mode 100644 tools/testing/selftests/bpf/test_progs.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 6a1ad58cb66f..ff68c9419a67 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,16 +1,18 @@
 LIBDIR := ../../../lib
 BPFDIR := $(LIBDIR)/bpf
 
-CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR)
-LDLIBS += -lcap
+CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) -I../../../include
+LDLIBS += -lcap -lelf
 
-TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map
+TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
+
+TEST_GEN_FILES = test_pkt_access.o
 
 TEST_PROGS := test_kmod.sh
 
 include ../lib.mk
 
-BPFOBJ := $(OUTPUT)/bpf.o
+BPFOBJ := $(OUTPUT)/libbpf.a
 
 $(TEST_GEN_PROGS): $(BPFOBJ)
 
@@ -21,3 +23,10 @@ $(TEST_GEN_PROGS): $(BPFOBJ)
 
 $(BPFOBJ): force
$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
+
+CLANG ?= clang
+
+%.o: %.c
+   $(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \
+   -D__x86_64__ -Wno-compare-distinct-pointer-types \
+   -O2 -target bpf -c $< -o $@
diff --git a/tools/testing/selftests/bpf/test_pkt_access.c 
b/tools/testing/selftests/bpf/test_pkt_access.c
new file mode 100644
index ..fd1e0832d409
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_pkt_access.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define _htons __builtin_bswap16
+#define barrier() __asm__ __volatile__("": : :"memory")
+int _version SEC("version") = 1;
+
+SEC("test1")
+int process(struct __sk_buff *skb)
+{
+   void *data_end = (void *)(long)skb->data_end;
+   void *data = (void *)(long)skb->data;
+   struct ethhdr *eth = (struct ethhdr *)(data);
+   struct tcphdr *tcp = NULL;
+   __u8 proto = 255;
+   __u64 ihl_len;
+
+   if (eth + 1 > data_end)
+   return TC_ACT_SHOT;
+
+   if (eth->h_proto == _htons(ETH_P_IP)) {
+   struct iphdr *iph = (struct iphdr *)(eth + 1);
+
+   if (iph + 1 > data_end)
+   return TC_ACT_SHOT;
+   ihl_len = iph->ihl * 4;
+   proto = iph->protocol;
+   tcp = (struct tcphdr *)((void *)(iph) + ihl_len);
+   } else if (eth->h_proto == _htons(ETH_P_IPV6)) {
+   struct ipv6hdr *ip6h = (struct ipv6hdr *)(eth + 1);
+
+   if (ip6h + 1 > data_end)
+   return TC_ACT_SHOT;
+   ihl_len = sizeof(*ip6h);
+   proto = ip6h->nexthdr;
+   tcp = (struct tcphdr *)((void *)(ip6h) + ihl_len);
+   }
+
+   if (tcp) {
+   if (((void *)(tcp) + 20) > data_end || proto != 6)
+   return TC_ACT_SHOT;
+   barrier(); /* to force ordering of checks */
+   if (((void *)(tcp) + 18) > data_end)
+   return TC_ACT_SHOT;
+   if (tcp->urg_ptr == 123)
+   return TC_ACT_OK;
+   }
+
+   return TC_ACT_UNSPEC;
+}
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
new file mode 100644
index ..1d9a310e71e5
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -0,0 +1,138 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+typedef __u16 __sum16;
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#define _htons __builtin_bswap16
+
+static int error_cnt, pass_cnt;
+
+/* ipv4 test vector */
+static struct {
+   struct ethhdr eth;
+   struct iphdr iph;
+   struct tcphdr tcp;
+} __packed pkt_v4 = {
+   .eth.h_proto = _htons(ETH_P_IP),
+

[PATCH v2 net-next 3/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Alexei Starovoitov

expose bpf_program__set_type() to set program type

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/lib/bpf/libbpf.c | 3 +--
 tools/lib/bpf/libbpf.h | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ac6eb863b2a4..1a2c07eb7795 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
 }
 
-static void bpf_program__set_type(struct bpf_program *prog,
- enum bpf_prog_type type)
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type)
 {
prog->type = type;
 }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b30394f9947a..32c7252f734e 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include   // for size_t
+#include <linux/bpf.h>
 
 enum libbpf_errno {
__LIBBPF_ERRNO__START = 4000,
@@ -185,6 +186,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog);
 int bpf_program__set_sched_act(struct bpf_program *prog);
 int bpf_program__set_xdp(struct bpf_program *prog);
 int bpf_program__set_perf_event(struct bpf_program *prog);
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
-- 
2.9.3

[PATCH v2 net-next 6/6] selftests/bpf: add l4 load balancer test based on sched_cls

2017-03-30 Thread Alexei Starovoitov

this l4lb demo is a comprehensive test case for LLVM codegen and
kernel verifier. It's using fully inlined jhash(), complex packet
parsing and multiple map lookups of different types to stress
llvm and verifier.
The map sizes, map population and test vectors are artificial to
exercise different paths through the bpf program.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile |   2 +-
 tools/testing/selftests/bpf/test_l4lb.c  | 474 +++
 tools/testing/selftests/bpf/test_progs.c |  88 ++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_l4lb.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 76cbe1d42dda..32fb7a294f0f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -6,7 +6,7 @@ LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
 
-TEST_GEN_FILES = test_pkt_access.o test_xdp.o
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o
 
 TEST_PROGS := test_kmod.sh
 
diff --git a/tools/testing/selftests/bpf/test_l4lb.c 
b/tools/testing/selftests/bpf/test_l4lb.c
new file mode 100644
index ..368bfe8b9842
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_l4lb.c
@@ -0,0 +1,474 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+
+#define htons __builtin_bswap16
+#define ntohs __builtin_bswap16
+int _version SEC("version") = 1;
+
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+   return (word << shift) | (word >> ((-shift) & 31));
+}
+
+/* copy paste of jhash from kernel sources to make sure llvm
+ * can compile it into valid sequence of bpf instructions
+ */
+#define __jhash_mix(a, b, c)   \
+{  \
+   a -= c;  a ^= rol32(c, 4);  c += b; \
+   b -= a;  b ^= rol32(a, 6);  a += c; \
+   c -= b;  c ^= rol32(b, 8);  b += a; \
+   a -= c;  a ^= rol32(c, 16); c += b; \
+   b -= a;  b ^= rol32(a, 19); a += c; \
+   c -= b;  c ^= rol32(b, 4);  b += a; \
+}
+
+#define __jhash_final(a, b, c) \
+{  \
+   c ^= b; c -= rol32(b, 14);  \
+   a ^= c; a -= rol32(c, 11);  \
+   b ^= a; b -= rol32(a, 25);  \
+   c ^= b; c -= rol32(b, 16);  \
+   a ^= c; a -= rol32(c, 4);   \
+   b ^= a; b -= rol32(a, 14);  \
+   c ^= b; c -= rol32(b, 24);  \
+}
+
+#define JHASH_INITVAL  0xdeadbeef
+
+typedef unsigned int u32;
+
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+   u32 a, b, c;
+   const unsigned char *k = key;
+
+   a = b = c = JHASH_INITVAL + length + initval;
+
+   while (length > 12) {
+   a += *(u32 *)(k);
+   b += *(u32 *)(k + 4);
+   c += *(u32 *)(k + 8);
+   __jhash_mix(a, b, c);
+   length -= 12;
+   k += 12;
+   }
+   switch (length) {
+   case 12: c += (u32)k[11]<<24;
+   case 11: c += (u32)k[10]<<16;
+   case 10: c += (u32)k[9]<<8;
+   case 9:  c += k[8];
+   case 8:  b += (u32)k[7]<<24;
+   case 7:  b += (u32)k[6]<<16;
+   case 6:  b += (u32)k[5]<<8;
+   case 5:  b += k[4];
+   case 4:  a += (u32)k[3]<<24;
+   case 3:  a += (u32)k[2]<<16;
+   case 2:  a += (u32)k[1]<<8;
+   case 1:  a += k[0];
+__jhash_final(a, b, c);
+   case 0: /* Nothing left to add */
+   break;
+   }
+
+   return c;
+}
+
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+   a += initval;
+   b += initval;
+   c += initval;
+   __jhash_final(a, b, c);
+   return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+   return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+#define PCKT_FRAGMENTED 65343
+#define IPV4_HDR_LEN_NO_OPT 20
+#define IPV4_PLUS_ICMP_HDR 28
+#define IPV6_PLUS_ICMP_HDR 48
+#define RING_SIZE 2
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0)
+#define F_HASH_NO_SRC_PORT (1 << 0)
+#define F_ICMP (1 << 0)
+#define F_SYN_SET (1 << 1)
+
+struct packet_description {
+   union {
+

[PATCH v2 net-next 2/6] tools/lib/bpf: add support for BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov

add support for BPF_PROG_TEST_RUN command to libbpf.a

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/include/uapi/linux/bpf.h | 24 
 tools/lib/bpf/bpf.c| 24 
 tools/lib/bpf/bpf.h|  4 +++-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1ea08ce35567..a1d95386f562 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+   BPF_PROG_TEST_RUN,
 };
 
 enum bpf_map_type {
@@ -189,6 +190,17 @@ union bpf_attr {
__u32   attach_type;
__u32   attach_flags;
};
+
+   struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+   __u32   prog_fd;
+   __u32   retval;
+   __u32   data_size_in;
+   __u32   data_size_out;
+   __aligned_u64   data_in;
+   __aligned_u64   data_out;
+   __u32   repeat;
+   __u32   duration;
+   } test;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -459,6 +471,18 @@ union bpf_attr {
  * Return:
  *   > 0 length of the string including the trailing NUL on success
  *   < 0 error
+ *
+ * u64 bpf_bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ *
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9b58d20e8c93..f84c398c11f4 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -209,3 +209,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type 
type)
 
return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
+
+int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
+ void *data_out, __u32 *size_out, __u32 *retval,
+ __u32 *duration)
+{
+   union bpf_attr attr;
+   int ret;
+
+   bzero(&attr, sizeof(attr));
+   attr.test.prog_fd = prog_fd;
+   attr.test.data_in = ptr_to_u64(data);
+   attr.test.data_out = ptr_to_u64(data_out);
+   attr.test.data_size_in = size;
+   attr.test.repeat = repeat;
+
+   ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+   if (size_out)
+   *size_out = attr.test.data_size_out;
+   if (retval)
+   *retval = attr.test.retval;
+   if (duration)
+   *duration = attr.test.duration;
+   return ret;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 93f021932623..edb4daeff7a5 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -47,6 +47,8 @@ int bpf_obj_get(const char *pathname);
 int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
unsigned int flags);
 int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
-
+int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
+ void *data_out, __u32 *size_out, __u32 *retval,
+ __u32 *duration);
 
 #endif
-- 
2.9.3

[PATCH v2 net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov

development and testing of networking bpf programs is quite cumbersome.
Despite availability of user space bpf interpreters the kernel is
the ultimate authority and execution environment.
Current test frameworks for TC include creation of netns, veth,
qdiscs and use of various packet generators just to test functionality
of a bpf program. XDP testing is even more complicated, since
qemu needs to be started with gro/gso disabled and precise queue
configuration, transferring of xdp program from host into guest,
attaching to virtio/eth0 and generating traffic from the host
while capturing the results from the guest.

Moreover analyzing performance bottlenecks in XDP program is
impossible in virtio environment, since cost of running the program
is tiny compared to the overhead of virtio packet processing,
so performance testing can only be done on physical nic
with another server generating traffic.

Furthermore ongoing changes to user space control plane of production
applications cannot be run on the test servers leaving bpf programs
stubbed out for testing.

Last but not least, the upstream llvm changes are validated by the bpf
backend testsuite which has no ability to test the code generated.

To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.

Joint work with Daniel Borkmann.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 include/linux/bpf.h  |   7 ++
 include/uapi/linux/bpf.h |  12 
 kernel/bpf/syscall.c |  27 +++-
 net/Makefile |   2 +-
 net/bpf/Makefile |   1 +
 net/bpf/test_run.c   | 172 +++
 net/core/filter.c|   5 ++
 7 files changed, 223 insertions(+), 3 deletions(-)
 create mode 100644 net/bpf/Makefile
 create mode 100644 net/bpf/test_run.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2ae39a3e9ead..bbb513da5075 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,6 +169,8 @@ struct bpf_verifier_ops {
  const struct bpf_insn *src,
  struct bpf_insn *dst,
  struct bpf_prog *prog);
+   int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
+   union bpf_attr __user *uattr);
 };
 
 struct bpf_prog_type_list {
@@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const 
void *src,
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
 
+int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr);
+int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr);
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 28317a04c34d..a1d95386f562 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+   BPF_PROG_TEST_RUN,
 };
 
 enum bpf_map_type {
@@ -189,6 +190,17 @@ union bpf_attr {
__u32   attach_type;
__u32   attach_flags;
};
+
+   struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+   __u32   prog_fd;
+   __u32   retval;
+   __u32   data_size_in;
+   __u32   data_size_out;
+   __aligned_u64   data_in;
+   __aligned_u64   data_out;
+   __u32   repeat;
+   __u32   duration;
+   } test;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c35ebfe6d84d..ab0cf4c43690 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 }
 #endif /* CONFIG_CGROUP_BPF */
 
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+
+static int bpf_prog_test_run(const union bpf_attr *attr,
+union bpf_attr __user *uattr)
+{
+   struct bpf_prog *prog;
+   int ret = -ENOTSUPP;
+
+   if (CHECK_ATTR(BPF_PROG_TEST_RUN))
+   return -EINVAL;
+
+   prog = bpf_prog_get(attr->test.prog_fd);
+   if (IS_ERR(prog))
+   return PTR_ERR(prog);
+
+   if (prog->aux->ops->test_run)
+   ret = prog->aux->ops->test_run(prog, attr, uattr);
+
+   bpf_prog_put(prog);
+   return ret;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, 
size)
 {
union bpf_attr attr = {};

Re: [PATCH V2 net-next 5/7] tun: support receiving skb through msg_control

2017-03-30 Thread Jason Wang




On 2017年03月30日 23:06, Michael S. Tsirkin wrote:

On Thu, Mar 30, 2017 at 03:22:28PM +0800, Jason Wang wrote:

This patch makes tun_recvmsg() can receive from skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang

Do we need to bother with tun? I didn't realize one
can even use that with vhost. What would be the point of
all the virtio header stuff dealing with checksums etc?

Even if you see a use-case is it worth optimizing?




It's for tap in fact. I use "tun" just because we have already had a 
tap.c which is used by macvtap.


Thanks

Re: [PATCH V2 net-next 6/7] tap: support receiving skb from msg_control

2017-03-30 Thread Jason Wang




On 2017年03月30日 23:03, Michael S. Tsirkin wrote:

On Thu, Mar 30, 2017 at 03:22:29PM +0800, Jason Wang wrote:

This patch makes tap_recvmsg() can receive from skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang
---
  drivers/net/tap.c | 12 
  1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index abdaf86..07d9174 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -824,15 +824,17 @@ static ssize_t tap_put_user(struct tap_queue *q,
  
  static ssize_t tap_do_read(struct tap_queue *q,

   struct iov_iter *to,
-  int noblock)
+  int noblock, struct sk_buff *skb)
  {
DEFINE_WAIT(wait);
-   struct sk_buff *skb;
ssize_t ret = 0;
  
  	if (!iov_iter_count(to))

return 0;
  
+	if (skb)

+   goto done;
+
while (1) {
if (!noblock)
prepare_to_wait(sk_sleep(&q->sk), &wait,
@@ -856,6 +858,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
if (!noblock)
finish_wait(sk_sleep(&q->sk), &wait);
  
+done:

Please just use an if {} block here. goto on error is ok,
but we are far from done here and goto done is misleading.




Ok.

Thanks.

Re: [PATCH 1/6] virtio: wrap find_vqs

2017-03-30 Thread Jason Wang




On 2017年03月30日 22:32, Michael S. Tsirkin wrote:

On Thu, Mar 30, 2017 at 02:00:08PM +0800, Jason Wang wrote:


On 2017年03月30日 04:48, Michael S. Tsirkin wrote:

We are going to add more parameters to find_vqs, let's wrap the call so
we don't need to tweak all drivers every time.

Signed-off-by: Michael S. Tsirkin
---

A quick glance and it looks ok, but what is the benefit of this series? Is it
required by other changes?

Thanks

Yes - to avoid touching all devices when doing the rest of
the patchset.


Maybe I wasn't clear. I mean the benefit of this series, not this single 
patch. I guess it may be used by your proposal that avoids a reset when setting 
XDP? If yes, do we really want to drop some packets after XDP is set?


Thanks

Re: [PATCH V2 net-next 7/7] vhost_net: try batch dequing from skb array

2017-03-30 Thread Jason Wang




On 2017年03月30日 22:21, Michael S. Tsirkin wrote:

On Thu, Mar 30, 2017 at 03:22:30PM +0800, Jason Wang wrote:

We used to dequeue one skb during recvmsg() from skb_array, this could
be inefficient because of the bad cache utilization

which cache does this refer to btw?


Both icache and dcache more or less.




and spinlock
touching for each packet.

Do you mean the effect of extra two atomics here?


In fact four, packet length peeking needs another two.




This patch tries to batch them by calling
batch dequeuing helpers explicitly on the exported skb array and pass
the skb back through msg_control for underlayer socket to finish the
userspace copying.

Tests were done by XDP1:
- small buffer:
   Before: 1.88Mpps
   After : 2.25Mpps (+19.6%)
- mergeable buffer:
   Before: 1.83Mpps
   After : 2.10Mpps (+14.7%)

Signed-off-by: Jason Wang 

Looks like I misread the code previously. More comments below,
sorry about not asking these questions earlier.


---
  drivers/vhost/net.c | 64 +
  1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b51989..ffa78c6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  
  #include 
  
@@ -85,6 +87,7 @@ struct vhost_net_ubuf_ref {

struct vhost_virtqueue *vq;
  };
  
+#define VHOST_RX_BATCH 64

  struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;

Could you please try playing with batch size and see
what the effect is?


Ok. I tried 32 which seems slower than 64 but still faster than no batching.




@@ -99,6 +102,10 @@ struct vhost_net_virtqueue {
/* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
+   struct skb_array *rx_array;
+   void *rxq[VHOST_RX_BATCH];
+   int rt;
+   int rh;
  };
  
  struct vhost_net {

@@ -201,6 +208,8 @@ static void vhost_net_vq_reset(struct vhost_net *n)
n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+   n->vqs[i].rt = 0;
+   n->vqs[i].rh = 0;
}
  
  }

@@ -503,13 +512,30 @@ static void handle_tx(struct vhost_net *net)
mutex_unlock(&vq->mutex);
  }
  
-static int peek_head_len(struct sock *sk)

+static int fetch_skbs(struct vhost_net_virtqueue *rvq)
+{
+   if (rvq->rh != rvq->rt)
+   goto out;
+
+   rvq->rh = rvq->rt = 0;
+   rvq->rt = skb_array_consume_batched(rvq->rx_array, rvq->rxq,
+   VHOST_RX_BATCH);
+   if (!rvq->rt)
+   return 0;
+out:
+   return __skb_array_len_with_tag(rvq->rxq[rvq->rh]);
+}
+
+static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
  {
struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
  
+	if (rvq->rx_array)

+   return fetch_skbs(rvq);
+
if (sock->ops->peek_len)
return sock->ops->peek_len(sock);
  
@@ -535,12 +561,14 @@ static int sk_has_rx_data(struct sock *sk)

return skb_queue_empty(&sk->sk_receive_queue);
  }
  
-static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)

+static int vhost_net_rx_peek_head_len(struct vhost_net *net,
+ struct sock *sk)
  {
+   struct vhost_net_virtqueue *rvq = >vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = >vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = >vq;
unsigned long uninitialized_var(endtime);
-   int len = peek_head_len(sk);
+   int len = peek_head_len(rvq, sk);
  
  	if (!len && vq->busyloop_timeout) {

/* Both tx vq and rx socket were polled here */
@@ -561,7 +589,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net 
*net, struct sock *sk)
vhost_poll_queue(>poll);
mutex_unlock(>mutex);
  
-		len = peek_head_len(sk);

+   len = peek_head_len(rvq, sk);
}
  
  	return len;

@@ -699,6 +727,8 @@ static void handle_rx(struct vhost_net *net)
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
+   if (nvq->rx_array)
+   msg.msg_control = nvq->rxq[nvq->rh++];
/* On overrun, truncate and discard */
if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(_iter, READ, vq->iov, 1, 1);

So there's a bit of a mystery here. vhost code isn't
batched, all we are batching is the fetch from the tun ring.


I've already had vhost batching code on top (e.g descriptor indices 
prefetching and used ring batched

Re: [PATCH V2 net-next 1/7] ptr_ring: introduce batch dequeuing

2017-03-30 Thread Jason Wang




On 2017年03月30日 21:53, Michael S. Tsirkin wrote:

On Thu, Mar 30, 2017 at 03:22:24PM +0800, Jason Wang wrote:

This patch introduces a batched version of consuming: a consumer can
dequeue more than one pointer from the ring at a time. We don't care
about the reordering of reads here, so there is no need for a compiler barrier.

Signed-off-by: Jason Wang 
---
  include/linux/ptr_ring.h | 65 
  1 file changed, 65 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6c70444..2be0f350 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -247,6 +247,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
return ptr;
  }
  
+static inline int __ptr_ring_consume_batched(struct ptr_ring *r,

+void **array, int n)

Can we use a shorter name? ptr_ring_consume_batch?


Ok, but at least we need to keep the prefix since there's a locked version.






+{
+   void *ptr;
+   int i;
+
+   for (i = 0; i < n; i++) {
+   ptr = __ptr_ring_consume(r);
+   if (!ptr)
+   break;
+   array[i] = ptr;
+   }
+
+   return i;
+}
+
  /*
   * Note: resize (below) nests producer lock within consumer lock, so if you
   * call this in interrupt or BH context, you must disable interrupts/BH when

I'd like to add a code comment here explaining why we don't
care about cpu or compiler reordering. And I think the reason is
in the way you use this API: in vhost it does not matter
if you get fewer entries than are present in the ring.
That's ok but needs to be noted
in a code comment so people use this function correctly.


Interesting, but I still think it's not necessary.

If the consumer is busy polling, it will eventually get the entries.
If the consumer needs a notification from the producer, it should drain the
queue, which means it needs to enable notification before the last
consuming call; otherwise it is a bug. The batch consuming function in
this patch guarantees that at least one pointer is returned if there are many;
this looks sufficient for correctness?


Thanks



Also, I think you need to repeat the comment about cpu_relax
near this function: if someone uses it in a loop,
a compiler barrier is needed to prevent compiler from
optimizing it out.

I note that ptr_ring_consume currently lacks any of these
comments so I'm ok with merging as is, and I'll add
documentation on top.
Like this perhaps?

/* Consume up to n entries and return the number of entries consumed
  * or 0 on ring empty.
  * Note: this might return early with fewer entries than are present in the
  * ring.
  * Note: callers invoking this in a loop must use a compiler barrier,
  * for example cpu_relax(). Callers must take consumer_lock
  * if the ring is ever resized - see e.g. ptr_ring_consume_batch.
  */




@@ -297,6 +313,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
return ptr;
  }
  
+static inline int ptr_ring_consume_batched(struct ptr_ring *r,

+  void **array, int n)
+{
+   int ret;
+
+   spin_lock(>consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock(>consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
+  void **array, int n)
+{
+   int ret;
+
+   spin_lock_irq(>consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irq(>consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
+  void **array, int n)
+{
+   unsigned long flags;
+   int ret;
+
+   spin_lock_irqsave(>consumer_lock, flags);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irqrestore(>consumer_lock, flags);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
+ void **array, int n)
+{
+   int ret;
+
+   spin_lock_bh(>consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_bh(>consumer_lock);
+
+   return ret;
+}
+
  /* Cast to structure type and call a function without discarding from FIFO.
   * Function must return a value.
   * Callers must take consumer_lock.
--
2.7.4

[PATCH] treewide: Correct diffrent[iate] and banlance typos

2017-03-30 Thread Joe Perches

Add these misspellings to scripts/spelling.txt too

Signed-off-by: Joe Perches 
---
 drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h | 2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c  | 2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c   | 2 +-
 drivers/net/ethernet/qlogic/qed/qed_int.c   | 2 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c  | 2 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c | 2 +-
 include/linux/mlx4/device.h | 2 +-
 scripts/spelling.txt| 3 +++
 8 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h 
b/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h
index 354ec07eae87..23ae72468025 100644
--- a/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h
+++ b/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h
@@ -70,7 +70,7 @@
 * (3) both long and short but short preferred and long only when necesarry
 *
 * These modes must be selected compile time via compile switches.
-* Compile switch settings for the diffrent modes:
+* Compile switch settings for the different modes:
 * (1) DRXDAPFASI_LONG_ADDR_ALLOWED=0, DRXDAPFASI_SHORT_ADDR_ALLOWED=1
 * (2) DRXDAPFASI_LONG_ADDR_ALLOWED=1, DRXDAPFASI_SHORT_ADDR_ALLOWED=0
 * (3) DRXDAPFASI_LONG_ADDR_ALLOWED=1, DRXDAPFASI_SHORT_ADDR_ALLOWED=1
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index cea6bdcde33f..8baf9d3eb4b1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -1591,7 +1591,7 @@ static int __bnx2x_vlan_mac_execute_step(struct bnx2x *bp,
if (rc != 0) {
__bnx2x_vlan_mac_h_pend(bp, o, *ramrod_flags);
 
-   /* Calling function should not diffrentiate between this case
+   /* Calling function should not differentiate between this case
 * and the case in which there is already a pending ramrod
 */
rc = 1;
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c 
b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index fca37e2c7f01..e70324f4fe84 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -1207,7 +1207,7 @@ static void hns_set_irq_affinity(struct hns_nic_priv 
*priv)
if (!alloc_cpumask_var(, GFP_KERNEL))
return;
 
-   /*diffrent irq banlance for 16core and 32core*/
+   /* different irq balance for 16core and 32core */
if (h->q_num == num_possible_cpus()) {
for (i = 0; i < h->q_num * 2; i++) {
rd = >ring_data[i];
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c 
b/drivers/net/ethernet/qlogic/qed/qed_int.c
index 84310b60849b..c6b348f00e7b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -3057,7 +3057,7 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn, struct 
qed_ptt *p_ptt)
 
/* There's a possibility the igu_sb_cnt_iov doesn't properly reflect
 * the number of VF SBs [especially for first VF on engine, as we can't
-* diffrentiate between empty entries and its entries].
+* differentiate between empty entries and its entries].
 * Since we don't really support more SBs than VFs today, prevent any
 * such configuration by sanitizing the number of SBs to equal the
 * number of VFs.
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c 
b/drivers/net/ethernet/qlogic/qed/qed_main.c
index d4edb993b1b0..b595f7dd4a58 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -951,7 +951,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
if (rc)
goto err2;
 
-   /* First Dword used to diffrentiate between various sources */
+   /* First Dword used to differentiate between various sources */
data = cdev->firmware->data + sizeof(u32);
 
qed_dbg_pf_init(cdev);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c 
b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 18fc6e62ca41..a69774b19712 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -625,7 +625,7 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 *  - If !ARI, VFs would start on next device.
 *so offset - (256 - pf_id) would provide the number.
 * Utilize the fact that (256 - pf_id) is achieved only by later
-* to diffrentiate between the two.
+* to differentiate between the two.
 */
 
if (p_hwfn->cdev->p_iov_info->offset < (256 - p_hwfn->abs_pf_id)) {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 1beb1ec2fbdf..eb1a51a6617b 100644
---

Re: [PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Wangnan (F)




On 2017/3/31 11:24, Alexei Starovoitov wrote:

On 3/30/17 8:12 PM, Wangnan (F) wrote:



On 2017/3/31 10:57, Alexei Starovoitov wrote:

On 3/30/17 7:53 PM, Wangnan (F) wrote:

I suggest using a CONFIG option to enable/disable code in
test_run.o to reduce attack plane.


attack plane? what attack do you see and how config helps?



I think all testing features are not required to be compiled
for a production system. A feature which should never be used
looks dangerous to me.


It is required on a production system, since xdp testing and
xdp production have to use the same kernel. We cannot
keep rebooting the server back and forth to test and then to run.
It's not testing the kernel features, it's testing bpf programs
which are technically user space components.



Okay. Now I understand it is a production feature.

Thank you.

Re: [PATCH net-next] sock: avoid dirtying sk_stamp, if possible

2017-03-30 Thread David Miller

From: Paolo Abeni 
Date: Thu, 30 Mar 2017 14:03:06 +0200

> sock_recv_ts_and_drops() unconditionally set sk->sk_stamp for
> every packet, even if the SOCK_TIMESTAMP flag is not set in the
> related socket.
> If selinux is enabled, this cause a cache miss for every packet
> since sk->sk_stamp and sk->sk_security share the same cacheline.
> With this change sk_stamp is set only if the SOCK_TIMESTAMP
> flag is set, and is cleared for the first packet, so that the user
> perceived behavior is unchanged.
> 
> This gives up to 5% speed-up under udp-flood with small packets.
> 
> Signed-off-by: Paolo Abeni 

Applied, thanks.

Re: [PATCH 1/2] virtio: allow drivers to validate features

2017-03-30 Thread Michael S. Tsirkin

On Thu, Mar 30, 2017 at 12:39:31PM -0700, David Miller wrote:
> From: "Michael S. Tsirkin" 
> Date: Wed, 29 Mar 2017 20:14:44 +0300
> 
> > Some drivers can't support all features in all configurations.  At the
> > moment we blindly set FEATURES_OK and later FAILED.  Support this better
> > by adding a callback drivers can use to do some early checks.
> > 
> > Signed-off-by: Michael S. Tsirkin 
> 
> Michael do you want me to take these virtio networking fixes into my
> tree directly or are you going to send me a pull request or something
> after it all settles down?
> 
> Thanks.

I think I'll send a pull request.

Thanks,

-- 
MST

Re: [PATCH net-next 2/6] tools/lib/bpf: add support for BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov


On 3/30/17 8:15 PM, Wangnan (F) wrote:

-
+int bpf_program_test_run(int prog_fd, int repeat, void *data, __u32
size,
+ void *data_out, __u32 *size_out, __u32 *retval,
+ __u32 *duration);



Please call it bpf_prog_test_run() so it looks uniform with others.


sure. will do. good catch.

Re: [PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov


On 3/30/17 8:12 PM, Wangnan (F) wrote:



On 2017/3/31 10:57, Alexei Starovoitov wrote:

On 3/30/17 7:53 PM, Wangnan (F) wrote:

I suggest using a CONFIG option to enable/disable code in
test_run.o to reduce attack plane.


attack plane? what attack do you see and how config helps?



I think all testing features are not required to be compiled
for a production system. A feature which should never be used
looks dangerous to me.


It is required on a production system, since xdp testing and
xdp production have to use the same kernel. We cannot
keep rebooting the server back and forth to test and then to run.
It's not testing the kernel features, it's testing bpf programs
which are technically user space components.


I suggest adding a CONFIG option like CONFIG_BPF_PROGRAM_TEST_RUN
to control whether the kernel should be compiled with this feature
or not. We can enable it by default, and give people a chance to
turn it off. At least in my company people tend to turn all
unneeded features off. If you don't provide a config option they
will make one by themselves.


Using this logic huawei should be turning off xdp as well.
Sorry, we're not going to stub the xdp facility out of the core
and the drivers just because you don't use it.

Re: [PATCH net-next 2/6] tools/lib/bpf: add support for BPF_PROG_TEST_RUN command

2017-03-30 Thread Wangnan (F)




On 2017/3/31 9:31, Alexei Starovoitov wrote:

add support for BPF_PROG_TEST_RUN command to libbpf.a

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
  tools/include/uapi/linux/bpf.h | 24 
  tools/lib/bpf/bpf.c| 24 
  tools/lib/bpf/bpf.h|  4 +++-
  3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1ea08ce35567..a1d95386f562 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+   BPF_PROG_TEST_RUN,
  };
  
  enum bpf_map_type {

@@ -189,6 +190,17 @@ union bpf_attr {
__u32   attach_type;
__u32   attach_flags;
};
+
+   struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+   __u32   prog_fd;
+   __u32   retval;
+   __u32   data_size_in;
+   __u32   data_size_out;
+   __aligned_u64   data_in;
+   __aligned_u64   data_out;
+   __u32   repeat;
+   __u32   duration;
+   } test;
  } __attribute__((aligned(8)));
  
  /* BPF helper function descriptions:

@@ -459,6 +471,18 @@ union bpf_attr {
   * Return:
   *   > 0 length of the string including the trailing NUL on success
   *   < 0 error
+ *
+ * u64 bpf_bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ *
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
   */
  #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9b58d20e8c93..b5ca5277e30c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -209,3 +209,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type 
type)
  
  	return sys_bpf(BPF_PROG_DETACH, , sizeof(attr));

  }
+
+int bpf_program_test_run(int prog_fd, int repeat, void *data, __u32 size,
+void *data_out, __u32 *size_out, __u32 *retval,
+__u32 *duration)
+{
+   union bpf_attr attr;
+   int ret;
+
+   bzero(, sizeof(attr));
+   attr.test.prog_fd = prog_fd;
+   attr.test.data_in = ptr_to_u64(data);
+   attr.test.data_out = ptr_to_u64(data_out);
+   attr.test.data_size_in = size;
+   attr.test.repeat = repeat;
+
+   ret = sys_bpf(BPF_PROG_TEST_RUN, , sizeof(attr));
+   if (size_out)
+   *size_out = attr.test.data_size_out;
+   if (retval)
+   *retval = attr.test.retval;
+   if (duration)
+   *duration = attr.test.duration;
+   return ret;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 93f021932623..adfb320ff21d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -47,6 +47,8 @@ int bpf_obj_get(const char *pathname);
  int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
unsigned int flags);
  int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
-
+int bpf_program_test_run(int prog_fd, int repeat, void *data, __u32 size,
+void *data_out, __u32 *size_out, __u32 *retval,
+__u32 *duration);
  


Please call it bpf_prog_test_run() so it looks uniform with others.

Thank you.

Re: [PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Wangnan (F)




On 2017/3/31 10:57, Alexei Starovoitov wrote:

On 3/30/17 7:53 PM, Wangnan (F) wrote:

I suggest using a CONFIG option to enable/disable code in
test_run.o to reduce attack plane.


attack plane? what attack do you see and how config helps?



I think all testing features are not required to be compiled
for a production system. A feature which should never be used
looks dangerous to me.

I suggest adding a CONFIG option like CONFIG_BPF_PROGRAM_TEST_RUN
to control whether the kernel should be compiled with this feature
or not. We can enable it by default, and give people a chance to
turn it off. At least in my company people tend to turn all
unneeded features off. If you don't provide a config option they
will make one by themselves.

Thank you.

Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements

2017-03-30 Thread David Miller

From: Salil Mehta 
Date: Thu, 30 Mar 2017 16:30:47 +0100

> This patch set introduces various HNS bug fixes, optimizations and code
> improvements.

There is no way you should do such an expensive calculation for every
single transmit packet as you are doing in your select_queue() routine.

That's really crazy.

Just use the networking stack's queue selection scheme, or suggest
ways to improve it.  Don't do private hashing like this in your
driver, please!

Re: [PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov


On 3/30/17 7:53 PM, Wangnan (F) wrote:

I suggest using a CONFIG option to enable/disable code in
test_run.o to reduce attack plane.


attack plane? what attack do you see and how config helps?

Re: [PATCH net-next 4/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Alexei Starovoitov


On 3/30/17 7:48 PM, Wangnan (F) wrote:



On 2017/3/31 10:37, Alexei Starovoitov wrote:

On 3/30/17 7:33 PM, Wangnan (F) wrote:

+void bpf_program__set_type(struct bpf_program *prog, enum
bpf_prog_type type);



This makes libbpf.h depend on uapi/linux/bpf.h (because of enum
bpf_prog_type), which is not always available.

What about defining another enum inside libbpf.h?


how about just including bpf.h? or making it 'int' instead of enum?



Including either kernel header into libbpf.h makes a lot of trouble,
because kernel header and uapi have many other things we don't need
and may conflict with existing code.


I'm not proposing to include kernel headers. Regular
/usr/include/linux/bpf.h is enough. This library isn't going to be
compiled on distros that don't have bpf support anyway.


Making it 'int' looks like a backdoor. We still need macro to define
each program type.


A macro for each program type wasn't the greatest idea. It is always
behind new program types and not usable for this use case.
See patches 5 and 6.

Re: [PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Wangnan (F)




On 2017/3/31 9:31, Alexei Starovoitov wrote:

development and testing of networking bpf programs is quite cumbersome.
Despite availability of user space bpf interpreters the kernel is
the ultimate authority and execution environment.
Current test frameworks for TC include creation of netns, veth,
qdiscs and use of various packet generators just to test functionality
of a bpf program. XDP testing is even more complicated, since
qemu needs to be started with gro/gso disabled and precise queue
configuration, transferring of xdp program from host into guest,
attaching to virtio/eth0 and generating traffic from the host
while capturing the results from the guest.

Moreover, analyzing performance bottlenecks in an XDP program is
impossible in a virtio environment, since the cost of running the program
is tiny compared to the overhead of virtio packet processing,
so performance testing can only be done on physical nic
with another server generating traffic.

Furthermore ongoing changes to user space control plane of production
applications cannot be run on the test servers leaving bpf programs
stubbed out for testing.

Last but not least, the upstream llvm changes are validated by the bpf
backend testsuite which has no ability to test the code generated.

To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.

Joint work with Daniel Borkmann.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
  include/linux/bpf.h  |   7 ++
  include/uapi/linux/bpf.h |  12 
  kernel/bpf/syscall.c |  27 +++-
  net/Makefile |   2 +-
  net/bpf/Makefile |   1 +
  net/bpf/test_run.c   | 172 +++
  net/core/filter.c|   5 ++
  7 files changed, 223 insertions(+), 3 deletions(-)
  create mode 100644 net/bpf/Makefile
  create mode 100644 net/bpf/test_run.c



[SNIP]


diff --git a/net/Makefile b/net/Makefile
index 9b681550e3a3..9086ffbb5085 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
  
  # LLC has to be linked before the files in net/802/

  obj-$(CONFIG_LLC) += llc/
-obj-$(CONFIG_NET)  += ethernet/ 802/ sched/ netlink/
+obj-$(CONFIG_NET)  += ethernet/ 802/ sched/ netlink/ bpf/
  obj-$(CONFIG_NETFILTER)   += netfilter/
  obj-$(CONFIG_INET)+= ipv4/
  obj-$(CONFIG_XFRM)+= xfrm/
diff --git a/net/bpf/Makefile b/net/bpf/Makefile
new file mode 100644
index ..27b2992a0692
--- /dev/null
+++ b/net/bpf/Makefile
@@ -0,0 +1 @@
+obj-y  := test_run.o


I suggest using a CONFIG option to enable/disable code in
test_run.o to reduce attack plane.

Thank you.

hello dear

2017-03-30 Thread terumi barger



Hello nice to meet you my name is Terumi. Your profile really impressed
me, send me a message  through my private email address
(terumibar...@gmail.com) so that i will tell you more about me and send my
pictures to you . I'm looking forward to hear from you.

Re: [PATCH net-next 4/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Wangnan (F)




On 2017/3/31 10:37, Alexei Starovoitov wrote:

On 3/30/17 7:33 PM, Wangnan (F) wrote:

+void bpf_program__set_type(struct bpf_program *prog, enum
bpf_prog_type type);



This makes libbpf.h depend on uapi/linux/bpf.h (because of enum
bpf_prog_type), which is not always available.

What about defining another enum inside libbpf.h?


how about just including bpf.h? or making it 'int' instead of enum?



Including either kernel header into libbpf.h makes a lot of trouble,
because kernel header and uapi have many other things we don't need
and may conflict with existing code.

Making it 'int' looks like a backdoor. We still need macro to define
each program type.

Thank you.

Re: [PATCH net-next v2 5/6] net: mpls: bump maximum number of labels

2017-03-30 Thread kbuild test robot

Hi David,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/David-Ahern/net-mpls-Allow-users-to-configure-more-labels-per-route/20170331-080314
config: x86_64-randconfig-ne0-03310806 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   In file included from include/linux/skbuff.h:17:0,
from net/mpls/af_mpls.c:2:
   net/mpls/af_mpls.c: In function 'mpls_rtm_newroute':
>> include/linux/kernel.h:766:21: warning: 'n_labels' may be used uninitialized 
>> in this function [-Wmaybe-uninitialized]
 max1 > max2 ? max1 : max2; })
^
   net/mpls/af_mpls.c:774:6: note: 'n_labels' was declared here
  u8 n_labels;
 ^~~~
--
   In file included from include/linux/skbuff.h:17:0,
from net//mpls/af_mpls.c:2:
   net//mpls/af_mpls.c: In function 'mpls_rtm_newroute':
>> include/linux/kernel.h:766:21: warning: 'n_labels' may be used uninitialized 
>> in this function [-Wmaybe-uninitialized]
 max1 > max2 ? max1 : max2; })
^
   net//mpls/af_mpls.c:774:6: note: 'n_labels' was declared here
  u8 n_labels;
 ^~~~

vim +/n_labels +766 include/linux/kernel.h

^1da177e Linus Torvalds 2005-04-16  750   * "unnecessary" pointer 
comparison.
^1da177e Linus Torvalds 2005-04-16  751   */
589a9785 Johannes Berg  2016-10-07  752  #define __min(t1, t2, min1, min2, 
x, y) ({ \
589a9785 Johannes Berg  2016-10-07  753 t1 min1 = (x);  
\
589a9785 Johannes Berg  2016-10-07  754 t2 min2 = (y);  
\
589a9785 Johannes Berg  2016-10-07  755 (void) ( == );
\
589a9785 Johannes Berg  2016-10-07  756 min1 < min2 ? min1 : min2; })
589a9785 Johannes Berg  2016-10-07  757  #define min(x, y)  
\
589a9785 Johannes Berg  2016-10-07  758 __min(typeof(x), typeof(y), 
\
589a9785 Johannes Berg  2016-10-07  759   __UNIQUE_ID(min1_), 
__UNIQUE_ID(min2_),   \
589a9785 Johannes Berg  2016-10-07  760   x, y)
589a9785 Johannes Berg  2016-10-07  761  
589a9785 Johannes Berg  2016-10-07  762  #define __max(t1, t2, max1, max2, 
x, y) ({ \
589a9785 Johannes Berg  2016-10-07  763 t1 max1 = (x);  
\
589a9785 Johannes Berg  2016-10-07  764 t2 max2 = (y);  
\
589a9785 Johannes Berg  2016-10-07  765 (void) ( == );
\
589a9785 Johannes Berg  2016-10-07 @766 max1 > max2 ? max1 : max2; })
589a9785 Johannes Berg  2016-10-07  767  #define max(x, y)  
\
589a9785 Johannes Berg  2016-10-07  768 __max(typeof(x), typeof(y), 
\
589a9785 Johannes Berg  2016-10-07  769   __UNIQUE_ID(max1_), 
__UNIQUE_ID(max2_),   \
589a9785 Johannes Berg  2016-10-07  770   x, y)
bdf4bbaa Harvey Harrison2008-04-30  771  
2e1d06e1 Michal Nazarewicz  2014-10-09  772  #define min3(x, y, z) 
min((typeof(x))min(x, y), z)
2e1d06e1 Michal Nazarewicz  2014-10-09  773  #define max3(x, y, z) 
max((typeof(x))max(x, y), z)
f27c85c5 Hagen Paul Pfeifer 2010-10-26  774  

:: The code at line 766 was first introduced by commit
:: 589a9785ee3a7cb85f1dedc3dad1c9754c691880 min/max: remove sparse warnings 
when they're nested

:: TO: Johannes Berg 
:: CC: Linus Torvalds 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

Re: [PATCH net-next 4/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Alexei Starovoitov


On 3/30/17 7:33 PM, Wangnan (F) wrote:

+void bpf_program__set_type(struct bpf_program *prog, enum
bpf_prog_type type);



This makes libbpf.h depend on uapi/linux/bpf.h (because of enum
bpf_prog_type), which is not always available.

What about defining another enum inside libbpf.h?


how about just including bpf.h? or making it 'int' instead of enum?

Re: [PATCH net-next 4/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Wangnan (F)




On 2017/3/31 9:31, Alexei Starovoitov wrote:

expose bpf_program__set_type() to set program type

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
  tools/lib/bpf/libbpf.c | 3 +--
  tools/lib/bpf/libbpf.h | 1 +
  2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ac6eb863b2a4..1a2c07eb7795 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
  }
  
-static void bpf_program__set_type(struct bpf_program *prog,

- enum bpf_prog_type type)
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type)
  {
prog->type = type;
  }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b30394f9947a..82adde30b696 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -185,6 +185,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog);
  int bpf_program__set_sched_act(struct bpf_program *prog);
  int bpf_program__set_xdp(struct bpf_program *prog);
  int bpf_program__set_perf_event(struct bpf_program *prog);
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
  


This makes libbpf.h depend on uapi/linux/bpf.h (because of enum
bpf_prog_type), which is not always available.

What about defining another enum inside libbpf.h?

Thank you.

[PATCH net-next v2] net: dsa: Mock-up driver

2017-03-30 Thread Florian Fainelli

This patch adds support for a DSA mock-up driver which essentially does
the following:

- registers/unregisters 4 fixed PHYs to the slave network devices
- uses eth0 (configurable) as the master netdev
- registers the switch as a fixed MDIO device against the fixed MDIO bus
  at address 31
- includes dynamic debug prints for dsa_switch_ops functions that can be
  enabled to get call traces

This is a good way to test modular builds as well as exercise the DSA
APIs without requiring access to real hardware. This does not test the
data-path, although this could be added later on.

Signed-off-by: Florian Fainelli 
---
Changes in v2:

- removed an unnecessary change to include/linux/fixed_phy.h

 drivers/net/dsa/Kconfig   |   8 +
 drivers/net/dsa/Makefile  |   2 +-
 drivers/net/dsa/dsa_loop.c| 329 ++
 drivers/net/dsa/dsa_loop.h|  19 +++
 drivers/net/dsa/dsa_loop_bdinfo.c |  34 
 5 files changed, 391 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/dsa/dsa_loop.c
 create mode 100644 drivers/net/dsa/dsa_loop.h
 create mode 100644 drivers/net/dsa/dsa_loop_bdinfo.c

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 564b267c8428..ba2e655eec19 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -34,4 +34,12 @@ config NET_DSA_QCA8K
  This enables support for the Qualcomm Atheros QCA8K Ethernet
  switch chips.
 
+config NET_DSA_LOOP
+   tristate "DSA mock-up Ethernet switch chip support"
+   depends on NET_DSA
+   select FIXED_PHY
+   ---help---
+ This enables support for a fake mock-up switch chip which
+ exercises the DSA APIs.
+
 endmenu
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index a3c941632217..5c8830991041 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -2,6 +2,6 @@ obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
 obj-$(CONFIG_NET_DSA_BCM_SF2)  += bcm-sf2.o
 bcm-sf2-objs   := bcm_sf2.o bcm_sf2_cfp.o
 obj-$(CONFIG_NET_DSA_QCA8K)+= qca8k.o
-
 obj-y  += b53/
 obj-y  += mv88e6xxx/
+obj-$(CONFIG_NET_DSA_LOOP) += dsa_loop.o dsa_loop_bdinfo.o
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
new file mode 100644
index ..0a9a2f846efd
--- /dev/null
+++ b/drivers/net/dsa/dsa_loop.c
@@ -0,0 +1,329 @@
+/*
+ * Distributed Switch Architecture loopback driver
+ *
+ * Copyright (C) 2016, Florian Fainelli 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "dsa_loop.h"
+
+struct dsa_loop_vlan {
+   u16 members;
+   u16 untagged;
+};
+
+#define DSA_LOOP_VLANS 5
+
+struct dsa_loop_priv {
+   struct mii_bus  *bus;
+   unsigned int port_base;
+   struct dsa_loop_vlan vlans[DSA_LOOP_VLANS];
+   struct net_device *netdev;
+   u16 pvid;
+};
+
+static struct phy_device *phydevs[PHY_MAX_ADDR];
+
+static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return DSA_TAG_PROTO_NONE;
+}
+
+static int dsa_loop_setup(struct dsa_switch *ds)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return 0;
+}
+
+static int dsa_loop_set_addr(struct dsa_switch *ds, u8 *addr)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return 0;
+}
+
+static int dsa_loop_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+   struct dsa_loop_priv *ps = ds->priv;
+   struct mii_bus *bus = ps->bus;
+
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return mdiobus_read_nested(bus, ps->port_base + port, regnum);
+}
+
+static int dsa_loop_phy_write(struct dsa_switch *ds, int port,
+ int regnum, u16 value)
+{
+   struct dsa_loop_priv *ps = ds->priv;
+   struct mii_bus *bus = ps->bus;
+
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return mdiobus_write_nested(bus, ps->port_base + port, regnum, value);
+}
+
+static int dsa_loop_port_bridge_join(struct dsa_switch *ds, int port,
+struct net_device *bridge)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+
+   return 0;
+}
+
+static void dsa_loop_port_bridge_leave(struct dsa_switch *ds, int port,
+  struct net_device *bridge)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+}
+
+static void dsa_loop_port_stp_state_set(struct dsa_switch *ds, int port,
+   u8 state)
+{
+   dev_dbg(ds->dev, "%s\n", __func__);
+}
+
+static int

[PATCH net-next 5/6] selftests/bpf: add a test for basic XDP functionality

2017-03-30 Thread Alexei Starovoitov

add C test for xdp_adjust_head(), packet rewrite and map lookups

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile   |   2 +-
 tools/testing/selftests/bpf/test_iptunnel_common.h |  37 
 tools/testing/selftests/bpf/test_progs.c   |  58 +
 tools/testing/selftests/bpf/test_xdp.c | 236 +
 4 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_iptunnel_common.h
 create mode 100644 tools/testing/selftests/bpf/test_xdp.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 3f76ab6250da..e4acc5b38f43 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -6,7 +6,7 @@ LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
 
-TEST_GEN_FILES = test_pkt_access.o
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o
 
 TEST_PROGS := test_kmod.sh
 
diff --git a/tools/testing/selftests/bpf/test_iptunnel_common.h 
b/tools/testing/selftests/bpf/test_iptunnel_common.h
new file mode 100644
index ..e4cd252a1b20
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_iptunnel_common.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _TEST_IPTNL_COMMON_H
+#define _TEST_IPTNL_COMMON_H
+
+#include 
+
+#define MAX_IPTNL_ENTRIES 256U
+
+struct vip {
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } daddr;
+   __u16 dport;
+   __u16 family;
+   __u8 protocol;
+};
+
+struct iptnl_info {
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } saddr;
+   union {
+   __u32 v6[4];
+   __u32 v4;
+   } daddr;
+   __u16 family;
+   __u8 dmac[6];
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index bc6002a1dfcc..667b6c5bcc7b 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -27,6 +27,7 @@ typedef __u16 __sum16;
 #include 
 #include 
 #include 
+#include "test_iptunnel_common.h"
 
 #define _htons __builtin_bswap16
 
@@ -100,6 +101,20 @@ static int bpf_prog_load(const char *file, enum 
bpf_prog_type type,
return 0;
 }
 
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+   const char *name)
+{
+   struct bpf_map *map;
+
+   map = bpf_object__find_map_by_name(obj, name);
+   if (!map) {
+   printf("%s:FAIL:map '%s' not found\n", test, name);
+   error_cnt++;
+   return -1;
+   }
+   return bpf_map__fd(map);
+}
+
 static void test_pkt_access(void)
 {
const char *file = "./test_pkt_access.o";
@@ -125,6 +140,48 @@ static void test_pkt_access(void)
bpf_object__close(obj);
 }
 
+static void test_xdp(void)
+{
+   struct vip key4 = {.protocol = 6, .family = AF_INET};
+   struct vip key6 = {.protocol = 6, .family = AF_INET6};
+   struct iptnl_info value4 = {.family = AF_INET};
+   struct iptnl_info value6 = {.family = AF_INET6};
+   const char *file = "./test_xdp.o";
+   struct bpf_object *obj;
+   char buf[128];
+   struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr);
+   struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+   __u32 duration, retval, size;
+   int err, prog_fd, map_fd;
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+   if (err)
+   return;
+
+   map_fd = bpf_find_map(__func__, obj, "vip2tnl");
+   if (map_fd < 0)
+   goto out;
+   bpf_map_update_elem(map_fd, &key4, &value4, 0);
+   bpf_map_update_elem(map_fd, &key6, &value6, 0);
+
+   err = bpf_program_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+  buf, &size, &retval, &duration);
+
+   CHECK(err || errno || retval != XDP_TX || size != 74 ||
+ iph->protocol != IPPROTO_IPIP, "ipv4",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+
+   err = bpf_program_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
+  buf, &size, &retval, &duration);
+   CHECK(err || errno || retval != XDP_TX || size != 114 ||
+ iph6->nexthdr != IPPROTO_IPV6, "ipv6",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+out:
+   bpf_object__close(obj);
+}
+
 int main(void)
 {
struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -132,6 +189,7 @@ int main(void)
setrlimit(RLIMIT_MEMLOCK, );
 
test_pkt_access();
+   test_xdp();

[PATCH net-next 2/6] tools/lib/bpf: add support for BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov

add support for BPF_PROG_TEST_RUN command to libbpf.a

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/include/uapi/linux/bpf.h | 24 
 tools/lib/bpf/bpf.c| 24 
 tools/lib/bpf/bpf.h|  4 +++-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1ea08ce35567..a1d95386f562 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+   BPF_PROG_TEST_RUN,
 };
 
 enum bpf_map_type {
@@ -189,6 +190,17 @@ union bpf_attr {
__u32   attach_type;
__u32   attach_flags;
};
+
+   struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+   __u32   prog_fd;
+   __u32   retval;
+   __u32   data_size_in;
+   __u32   data_size_out;
+   __aligned_u64   data_in;
+   __aligned_u64   data_out;
+   __u32   repeat;
+   __u32   duration;
+   } test;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -459,6 +471,18 @@ union bpf_attr {
  * Return:
  *   > 0 length of the string including the trailing NUL on success
  *   < 0 error
+ *
+ * u64 bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ *
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9b58d20e8c93..b5ca5277e30c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -209,3 +209,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type 
type)
 
return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
+
+int bpf_program_test_run(int prog_fd, int repeat, void *data, __u32 size,
+void *data_out, __u32 *size_out, __u32 *retval,
+__u32 *duration)
+{
+   union bpf_attr attr;
+   int ret;
+
+   bzero(&attr, sizeof(attr));
+   attr.test.prog_fd = prog_fd;
+   attr.test.data_in = ptr_to_u64(data);
+   attr.test.data_out = ptr_to_u64(data_out);
+   attr.test.data_size_in = size;
+   attr.test.repeat = repeat;
+
+   ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+   if (size_out)
+   *size_out = attr.test.data_size_out;
+   if (retval)
+   *retval = attr.test.retval;
+   if (duration)
+   *duration = attr.test.duration;
+   return ret;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 93f021932623..adfb320ff21d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -47,6 +47,8 @@ int bpf_obj_get(const char *pathname);
 int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
unsigned int flags);
 int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
-
+int bpf_program_test_run(int prog_fd, int repeat, void *data, __u32 size,
+void *data_out, __u32 *size_out, __u32 *retval,
+__u32 *duration);
 
 #endif
-- 
2.9.3

[PATCH net-next 6/6] selftests/bpf: add l4 load balancer test based on sched_cls

2017-03-30 Thread Alexei Starovoitov

this l4lb demo is a comprehensive test case for LLVM codegen and
kernel verifier. It's using fully inlined jhash(), complex packet
parsing and multiple map lookups of different types to stress
llvm and verifier.
The map sizes, map population and test vectors are artificial to
exercise different paths through the bpf program.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile |   2 +-
 tools/testing/selftests/bpf/test_l4lb.c  | 474 +++
 tools/testing/selftests/bpf/test_progs.c |  88 ++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_l4lb.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index e4acc5b38f43..207e91c5780b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -6,7 +6,7 @@ LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
 
-TEST_GEN_FILES = test_pkt_access.o test_xdp.o
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o
 
 TEST_PROGS := test_kmod.sh
 
diff --git a/tools/testing/selftests/bpf/test_l4lb.c 
b/tools/testing/selftests/bpf/test_l4lb.c
new file mode 100644
index ..368bfe8b9842
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_l4lb.c
@@ -0,0 +1,474 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+
+#define htons __builtin_bswap16
+#define ntohs __builtin_bswap16
+int _version SEC("version") = 1;
+
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+   return (word << shift) | (word >> ((-shift) & 31));
+}
+
+/* copy paste of jhash from kernel sources to make sure llvm
+ * can compile it into valid sequence of bpf instructions
+ */
+#define __jhash_mix(a, b, c)   \
+{  \
+   a -= c;  a ^= rol32(c, 4);  c += b; \
+   b -= a;  b ^= rol32(a, 6);  a += c; \
+   c -= b;  c ^= rol32(b, 8);  b += a; \
+   a -= c;  a ^= rol32(c, 16); c += b; \
+   b -= a;  b ^= rol32(a, 19); a += c; \
+   c -= b;  c ^= rol32(b, 4);  b += a; \
+}
+
+#define __jhash_final(a, b, c) \
+{  \
+   c ^= b; c -= rol32(b, 14);  \
+   a ^= c; a -= rol32(c, 11);  \
+   b ^= a; b -= rol32(a, 25);  \
+   c ^= b; c -= rol32(b, 16);  \
+   a ^= c; a -= rol32(c, 4);   \
+   b ^= a; b -= rol32(a, 14);  \
+   c ^= b; c -= rol32(b, 24);  \
+}
+
+#define JHASH_INITVAL  0xdeadbeef
+
+typedef unsigned int u32;
+
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+   u32 a, b, c;
+   const unsigned char *k = key;
+
+   a = b = c = JHASH_INITVAL + length + initval;
+
+   while (length > 12) {
+   a += *(u32 *)(k);
+   b += *(u32 *)(k + 4);
+   c += *(u32 *)(k + 8);
+   __jhash_mix(a, b, c);
+   length -= 12;
+   k += 12;
+   }
+   switch (length) {
+   case 12: c += (u32)k[11]<<24;
+   case 11: c += (u32)k[10]<<16;
+   case 10: c += (u32)k[9]<<8;
+   case 9:  c += k[8];
+   case 8:  b += (u32)k[7]<<24;
+   case 7:  b += (u32)k[6]<<16;
+   case 6:  b += (u32)k[5]<<8;
+   case 5:  b += k[4];
+   case 4:  a += (u32)k[3]<<24;
+   case 3:  a += (u32)k[2]<<16;
+   case 2:  a += (u32)k[1]<<8;
+   case 1:  a += k[0];
+__jhash_final(a, b, c);
+   case 0: /* Nothing left to add */
+   break;
+   }
+
+   return c;
+}
+
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+   a += initval;
+   b += initval;
+   c += initval;
+   __jhash_final(a, b, c);
+   return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+   return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+#define PCKT_FRAGMENTED 65343
+#define IPV4_HDR_LEN_NO_OPT 20
+#define IPV4_PLUS_ICMP_HDR 28
+#define IPV6_PLUS_ICMP_HDR 48
+#define RING_SIZE 2
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0)
+#define F_HASH_NO_SRC_PORT (1 << 0)
+#define F_ICMP (1 << 0)
+#define F_SYN_SET (1 << 1)
+
+struct packet_description {
+   union {
+

[PATCH net-next 3/6] selftests/bpf: add a test for overlapping packet range checks

2017-03-30 Thread Alexei Starovoitov

add simple C test case for llvm and verifier range check fix from
commit b1977682a385 ("bpf: improve verifier packet range checks")

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/testing/selftests/bpf/Makefile  |  18 +++-
 tools/testing/selftests/bpf/test_pkt_access.c |  64 
 tools/testing/selftests/bpf/test_progs.c  | 138 ++
 3 files changed, 216 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_pkt_access.c
 create mode 100644 tools/testing/selftests/bpf/test_progs.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 6a1ad58cb66f..3f76ab6250da 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,16 +1,18 @@
 LIBDIR := ../../../lib
 BPFDIR := $(LIBDIR)/bpf
 
-CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR)
-LDLIBS += -lcap
+CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) -I../../../include
+LDLIBS += -lcap -lelf
 
-TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map
+TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs
+
+TEST_GEN_FILES = test_pkt_access.o
 
 TEST_PROGS := test_kmod.sh
 
 include ../lib.mk
 
-BPFOBJ := $(OUTPUT)/bpf.o
+BPFOBJ := $(OUTPUT)/libbpf.a
 
 $(TEST_GEN_PROGS): $(BPFOBJ)
 
@@ -21,3 +23,11 @@ $(TEST_GEN_PROGS): $(BPFOBJ)
 
 $(BPFOBJ): force
$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
+
+CLANG ?= clang
+
+%.o: %.c
+   $(CLANG) -I../../../include/uapi  \
+   -I../../../../samples/bpf/ -D__x86_64__ \
+   -Wno-compare-distinct-pointer-types \
+   -O2 -target bpf -c $< -o $@
diff --git a/tools/testing/selftests/bpf/test_pkt_access.c 
b/tools/testing/selftests/bpf/test_pkt_access.c
new file mode 100644
index ..fd1e0832d409
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_pkt_access.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define _htons __builtin_bswap16
+#define barrier() __asm__ __volatile__("": : :"memory")
+int _version SEC("version") = 1;
+
+SEC("test1")
+int process(struct __sk_buff *skb)
+{
+   void *data_end = (void *)(long)skb->data_end;
+   void *data = (void *)(long)skb->data;
+   struct ethhdr *eth = (struct ethhdr *)(data);
+   struct tcphdr *tcp = NULL;
+   __u8 proto = 255;
+   __u64 ihl_len;
+
+   if (eth + 1 > data_end)
+   return TC_ACT_SHOT;
+
+   if (eth->h_proto == _htons(ETH_P_IP)) {
+   struct iphdr *iph = (struct iphdr *)(eth + 1);
+
+   if (iph + 1 > data_end)
+   return TC_ACT_SHOT;
+   ihl_len = iph->ihl * 4;
+   proto = iph->protocol;
+   tcp = (struct tcphdr *)((void *)(iph) + ihl_len);
+   } else if (eth->h_proto == _htons(ETH_P_IPV6)) {
+   struct ipv6hdr *ip6h = (struct ipv6hdr *)(eth + 1);
+
+   if (ip6h + 1 > data_end)
+   return TC_ACT_SHOT;
+   ihl_len = sizeof(*ip6h);
+   proto = ip6h->nexthdr;
+   tcp = (struct tcphdr *)((void *)(ip6h) + ihl_len);
+   }
+
+   if (tcp) {
+   if (((void *)(tcp) + 20) > data_end || proto != 6)
+   return TC_ACT_SHOT;
+   barrier(); /* to force ordering of checks */
+   if (((void *)(tcp) + 18) > data_end)
+   return TC_ACT_SHOT;
+   if (tcp->urg_ptr == 123)
+   return TC_ACT_OK;
+   }
+
+   return TC_ACT_UNSPEC;
+}
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
new file mode 100644
index ..bc6002a1dfcc
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -0,0 +1,138 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+typedef __u16 __sum16;
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#define _htons __builtin_bswap16
+
+static int error_cnt, pass_cnt;
+
+/* ipv4 test vector */
+static struct {
+   struct ethhdr eth;
+   struct iphdr iph;
+   struct tcphdr tcp;
+} __packed pkt_v4 = {
+   .eth.h_proto =

[PATCH net-next 4/6] tools/lib/bpf: expose bpf_program__set_type()

2017-03-30 Thread Alexei Starovoitov

expose bpf_program__set_type() to set program type

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 tools/lib/bpf/libbpf.c | 3 +--
 tools/lib/bpf/libbpf.h | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ac6eb863b2a4..1a2c07eb7795 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
 }
 
-static void bpf_program__set_type(struct bpf_program *prog,
- enum bpf_prog_type type)
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type)
 {
prog->type = type;
 }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b30394f9947a..82adde30b696 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -185,6 +185,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog);
 int bpf_program__set_sched_act(struct bpf_program *prog);
 int bpf_program__set_xdp(struct bpf_program *prog);
 int bpf_program__set_perf_event(struct bpf_program *prog);
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
-- 
2.9.3

[PATCH net-next 1/6] bpf: introduce BPF_PROG_TEST_RUN command

2017-03-30 Thread Alexei Starovoitov

development and testing of networking bpf programs is quite cumbersome.
Despite availability of user space bpf interpreters the kernel is
the ultimate authority and execution environment.
Current test frameworks for TC include creation of netns, veth,
qdiscs and use of various packet generators just to test functionality
of a bpf program. XDP testing is even more complicated, since
qemu needs to be started with gro/gso disabled and precise queue
configuration, transferring of xdp program from host into guest,
attaching to virtio/eth0 and generating traffic from the host
while capturing the results from the guest.

Moreover analyzing performance bottlenecks in XDP program is
impossible in virtio environment, since cost of running the program
is tiny compared to the overhead of virtio packet processing,
so performance testing can only be done on physical nic
with another server generating traffic.

Furthermore ongoing changes to user space control plane of production
applications cannot be run on the test servers leaving bpf programs
stubbed out for testing.

Last but not least, the upstream llvm changes are validated by the bpf
backend testsuite which has no ability to test the code generated.

To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.

Joint work with Daniel Borkmann.

Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
Acked-by: Martin KaFai Lau 
---
 include/linux/bpf.h  |   7 ++
 include/uapi/linux/bpf.h |  12 
 kernel/bpf/syscall.c |  27 +++-
 net/Makefile |   2 +-
 net/bpf/Makefile |   1 +
 net/bpf/test_run.c   | 172 +++
 net/core/filter.c|   5 ++
 7 files changed, 223 insertions(+), 3 deletions(-)
 create mode 100644 net/bpf/Makefile
 create mode 100644 net/bpf/test_run.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2ae39a3e9ead..bbb513da5075 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,6 +169,8 @@ struct bpf_verifier_ops {
  const struct bpf_insn *src,
  struct bpf_insn *dst,
  struct bpf_prog *prog);
+   int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
+   union bpf_attr __user *uattr);
 };
 
 struct bpf_prog_type_list {
@@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const 
void *src,
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
 
+int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr);
+int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr);
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 28317a04c34d..a1d95386f562 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+   BPF_PROG_TEST_RUN,
 };
 
 enum bpf_map_type {
@@ -189,6 +190,17 @@ union bpf_attr {
__u32   attach_type;
__u32   attach_flags;
};
+
+   struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+   __u32   prog_fd;
+   __u32   retval;
+   __u32   data_size_in;
+   __u32   data_size_out;
+   __aligned_u64   data_in;
+   __aligned_u64   data_out;
+   __u32   repeat;
+   __u32   duration;
+   } test;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c35ebfe6d84d..ab0cf4c43690 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 }
 #endif /* CONFIG_CGROUP_BPF */
 
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+
+static int bpf_prog_test_run(const union bpf_attr *attr,
+union bpf_attr __user *uattr)
+{
+   struct bpf_prog *prog;
+   int ret = -ENOTSUPP;
+
+   if (CHECK_ATTR(BPF_PROG_TEST_RUN))
+   return -EINVAL;
+
+   prog = bpf_prog_get(attr->test.prog_fd);
+   if (IS_ERR(prog))
+   return PTR_ERR(prog);
+
+   if (prog->aux->ops->test_run)
+   ret = prog->aux->ops->test_run(prog, attr, uattr);
+
+   bpf_prog_put(prog);
+   return ret;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, 
size)
 {
union bpf_attr attr = {};

[PATCH net-next 0/6] bpf: program testing framework

2017-03-30 Thread Alexei Starovoitov

Development and testing of networking bpf programs is quite cumbersome.
Especially tricky are XDP programs that attach to real netdevices and
program development feels like working on the car engine while
the car is in motion.
Another problem is ongoing changes to upstream llvm core
that can introduce an optimization that verifier will not
recognize. llvm bpf backend tests have no ability to run the programs.
To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.
It achieves several goals:
- development of xdp and skb based bpf programs can be done
in a canned environment with unit tests
- program performance optimizations can be benchmarked outside of
networking core (without driver and skb costs)
- continuous testing of upstream changes is finally practical

Patches 3,5,6 add C based test cases of various complexity
to cover some sched_cls and xdp features. More tests will
be added in the future. The tests were run on centos7 only.

For now the framework supports only skb and xdp programs. In the future
it can be extended to socket_filter and tracing program types.

More details are in individual patches.

Alexei Starovoitov (6):
  bpf: introduce BPF_PROG_TEST_RUN command
  tools/lib/bpf: add support for BPF_PROG_TEST_RUN command
  selftests/bpf: add a test for overlapping packet range checks
  tools/lib/bpf: expose bpf_program__set_type()
  selftests/bpf: add a test for basic XDP functionality
  selftests/bpf: add l4 load balancer test based on sched_cls

 include/linux/bpf.h|   7 +
 include/uapi/linux/bpf.h   |  12 +
 kernel/bpf/syscall.c   |  27 +-
 net/Makefile   |   2 +-
 net/bpf/Makefile   |   1 +
 net/bpf/test_run.c | 172 
 net/core/filter.c  |   5 +
 tools/include/uapi/linux/bpf.h |  24 ++
 tools/lib/bpf/bpf.c|  24 ++
 tools/lib/bpf/bpf.h|   4 +-
 tools/lib/bpf/libbpf.c |   3 +-
 tools/lib/bpf/libbpf.h |   1 +
 tools/testing/selftests/bpf/Makefile   |  18 +-
 tools/testing/selftests/bpf/test_iptunnel_common.h |  37 ++
 tools/testing/selftests/bpf/test_l4lb.c| 474 +
 tools/testing/selftests/bpf/test_pkt_access.c  |  64 +++
 tools/testing/selftests/bpf/test_progs.c   | 284 
 tools/testing/selftests/bpf/test_xdp.c | 236 ++
 18 files changed, 1385 insertions(+), 10 deletions(-)
 create mode 100644 net/bpf/Makefile
 create mode 100644 net/bpf/test_run.c
 create mode 100644 tools/testing/selftests/bpf/test_iptunnel_common.h
 create mode 100644 tools/testing/selftests/bpf/test_l4lb.c
 create mode 100644 tools/testing/selftests/bpf/test_pkt_access.c
 create mode 100644 tools/testing/selftests/bpf/test_progs.c
 create mode 100644 tools/testing/selftests/bpf/test_xdp.c

-- 
2.9.3

Re: [next-queue v6 PATCH 7/7] i40e: Add support to get switch id and port number for port netdevs

2017-03-30 Thread Jakub Kicinski

On Thu, 30 Mar 2017 15:31:01 -0700, Alexander Duyck wrote:
> On Thu, Mar 30, 2017 at 2:45 PM, Jakub Kicinski
>  wrote:
> > On Wed, 29 Mar 2017 17:22:55 -0700, Sridhar Samudrala wrote:  
> >> Introduce switchdev_ops to PF and port netdevs to return the switch id via
> >> SWITCHDEV_ATTR_ID_PORT_PARENT_ID attribute.
> >> Also, ndo_get_phys_port_name() support is added to port netdevs to return
> >> the port number.
> >>  
> > ...  
> >> +static int
> >> +i40e_port_netdev_get_phys_port_name(struct net_device *dev, char *buf,
> >> + size_t len)
> >> +{
> >> + struct i40e_port_netdev_priv *priv = netdev_priv(dev);
> >> + struct i40e_vf *vf;
> >> + int ret;
> >> +
> >> + switch (priv->type) {
> >> + case I40E_PORT_NETDEV_VF:
> >> + vf = (struct i40e_vf *)priv->f;
> >> + ret = snprintf(buf, len, "%d", vf->vf_id);
> >> + break;
> >> + case I40E_PORT_NETDEV_PF:
> >> + ret = snprintf(buf, len, "%d", I40E_MAIN_VSI_PORT_ID);
> >> + break;
> >> + default:
> >> + return -EOPNOTSUPP;
> >> + }
> >> +
> >> + if (ret >= len)
> >> + return -EOPNOTSUPP;
> >> +
> >> + return 0;
> >> +}  
> >
> > You are using only an integer here, which forces you to manually name
> > the netdev in patch 2, and that is what phys_port_name is supposed to
> > help avoid doing AFAIU.
> >
> > We have naming rules in Documentation/networking/switchdev.txt for
> > switch ports suggested as pX for physical ports or pXsY for ports which
> > are broken out/split.  Could we establish similar suggestion for vf and
> > pf representors and document it? (note: we may need pf representors for
> > multi-host devices.)
> >
> > IMHO naming representors pfr%d or vfr%d would make sense.  This way
> > actual VF and PF netdevs could be called pf%d and vf%d, and
> > udev/systemd will give all netdevs nice, meaningful names without any
> > custom rules.
> >
> > Sorry for the bike shedding but I was hoping we could save some user
> > pain by establishing those rules (more or less) upfront.  
> 
> This is something we should probably discuss at netdev/netconf next
> week. It seems like the convention has been to just use an integer and
> I think we might want to look at doing something like you are
> suggesting where if nothing else we come up with a way of identifying
> that a VF versus something like a segmented port which is the only
> thing currently defined in the documentation.

Sure.  If we want to talk about this at netdev there is another
more minor thing we were pondering.  How to represent the VF -- PCI DEV
-- representor netdev relation nicely e.g. for OpenStack integration?

AFAIU when PCI device is added to a VM user space should add the
representors to appropriate bridges and fire the legacy sriov ndos
to set mac/vlan.  VF PCI dev and PF PCI dev are nicely linked in sysfs
via virtfnX and physfn files.  But going from VF PCI dev to the
representor requires iteration over all representor netdevs to find the
right switchdev_id + phys_port_name combination.

One way to solve this would be to SET_NETDEV_DEV() the representor
netdev to the VF pci dev, but then representors may not share the base
enpXsYfZ name since they will be using different PCI devices as the
parent...

RE: [PATCH nf-next 1/1] net: tcp: Refine the __tcp_select_window

2017-03-30 Thread Gao Feng


> -Original Message-
> From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org]
> On Behalf Of David Miller
> Sent: Friday, March 31, 2017 6:42 AM
> To: gfree.w...@foxmail.com
> Cc: kuz...@ms2.inr.ac.ru; jmor...@namei.org; netdev@vger.kernel.org;
> f...@ikuai8.com
> Subject: Re: [PATCH nf-next 1/1] net: tcp: Refine the __tcp_select_window
> 
> From: gfree.w...@foxmail.com
> Date: Thu, 30 Mar 2017 06:49:19 +0800
> 
> > From: Gao Feng 
> >
> > 1. Move the "window = tp->rcv_wnd;" into the condition block without
> > tp->rx_opt.rcv_wscale.
> > Because it is unnecessary when enable wscale;
> >
> > 2. Use the macro ALIGN instead of two statements.
> > The two statements are used to make window align to
> > 1 << tp->rx_opt.rcv_wscale.
> > Use the ALIGN is more clearer.
> >
> > 3. Use the rounddown to make codes clearer.
> >
> > Signed-off-by: Gao Feng 
> 
> Applied, but please do not target non-netfilter patches using "nf-next" in
> your Subject lines.

Sorry, I misspelled the subject.
I will pay more attention to it.

Regards
Feng

[PATCH net 3/3] bpf: add various verifier test cases for self-tests

2017-03-30 Thread Daniel Borkmann

Add a couple of test cases, for example, probing for xadd on a spilled
pointer to packet and map_value_adj register, various other map_value_adj
tests including the unaligned load/store, and trying out pointer arithmetic
on map_value_adj register itself. For the unaligned load/store, we need
to figure out whether the architecture has efficient unaligned access and
need to mark affected tests accordingly.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 tools/include/linux/filter.h|  10 ++
 tools/testing/selftests/bpf/Makefile|   9 +-
 tools/testing/selftests/bpf/test_verifier.c | 270 +++-
 3 files changed, 283 insertions(+), 6 deletions(-)

diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index 122153b..390d7c9 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -168,6 +168,16 @@
.off   = OFF,   \
.imm   = 0 })
 
+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF)  \
+   ((struct bpf_insn) {\
+   .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,   \
+   .dst_reg = DST, \
+   .src_reg = SRC, \
+   .off   = OFF,   \
+   .imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
 #define BPF_ST_MEM(SIZE, DST, OFF, IMM)\
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 6a1ad58..9af09e8 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,7 +1,14 @@
 LIBDIR := ../../../lib
 BPFDIR := $(LIBDIR)/bpf
+APIDIR := ../../../include/uapi
+GENDIR := ../../../../include/generated
+GENHDR := $(GENDIR)/autoconf.h
 
-CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR)
+ifneq ($(wildcard $(GENHDR)),)
+  GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS)
 LDLIBS += -lcap
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 7d761d4..c848e90 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -30,6 +30,14 @@
 
 #include 
 
+#ifdef HAVE_GENHDR
+# include "autoconf.h"
+#else
+# if defined(__i386) || defined(__x86_64) || defined(__s390x__) || 
defined(__aarch64__)
+#  define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
+# endif
+#endif
+
 #include "../../../include/linux/filter.h"
 
 #ifndef ARRAY_SIZE
@@ -39,6 +47,8 @@
 #define MAX_INSNS  512
 #define MAX_FIXUPS 8
 
+#define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0)
+
 struct bpf_test {
const char *descr;
struct bpf_insn insns[MAX_INSNS];
@@ -53,6 +63,7 @@ struct bpf_test {
REJECT
} result, result_unpriv;
enum bpf_prog_type prog_type;
+   uint8_t flags;
 };
 
 /* Note we want this to be 64 bit aligned so that the end of our array is
@@ -2432,6 +2443,30 @@ struct test_val {
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
{
+   "direct packet access: test15 (spill with xadd)",
+   .insns = {
+   BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+   offsetof(struct __sk_buff, data)),
+   BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+   offsetof(struct __sk_buff, data_end)),
+   BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+   BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8),
+   BPF_MOV64_IMM(BPF_REG_5, 4096),
+   BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+   BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+   BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+   BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+   BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0),
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   },
+   .errstr = "R2 invalid mem access 'inv'",
+   .result = REJECT,
+   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+   },
+   {
"helper access to packet: test1, valid packet_ptr range",
.insns = {
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
@@ -2934,6 +2969,7 @@ struct test_val {

[PATCH net 1/3] bpf, verifier: fix alu ops against map_value{,_adj} register types

2017-03-30 Thread Daniel Borkmann

While looking into map_value_adj, I noticed that alu operations
directly on the map_value() resp. map_value_adj() register (any
alu operation on a map_value() register will turn it into a
map_value_adj() typed register) are not sufficiently protected
against some of the operations. Two non-exhaustive examples are
provided that the verifier needs to reject:

 i) BPF_AND on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xbf842a00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (57) r0 &= 8
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=8 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

ii) BPF_ADD in 32 bit mode on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xc24eee00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (04) (u32) r0 += (u32) 0
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

Issue is, while min_value / max_value boundaries for the access
are adjusted appropriately, we change the pointer value in a way
that cannot be sufficiently tracked anymore from its origin.
Operations like BPF_{AND,OR,DIV,MUL,etc} on a destination register
that is PTR_TO_MAP_VALUE{,_ADJ} was probably unintended, in fact,
all the test cases coming with 484611357c19 ("bpf: allow access
into map value arrays") perform BPF_ADD only on the destination
register that is PTR_TO_MAP_VALUE_ADJ.

Only for UNKNOWN_VALUE register types such operations make sense,
f.e. with unknown memory content fetched initially from a constant
offset from the map value memory into a register. That register is
then later tested against lower / upper bounds, so that the verifier
can then do the tracking of min_value / max_value, and properly
check once that UNKNOWN_VALUE register is added to the destination
register with type PTR_TO_MAP_VALUE{,_ADJ}. This is also what the
original use-case is solving. Note, tracking on what is being
added is done through adjust_reg_min_max_vals() and later access
to the map value enforced with these boundaries and the given offset
from the insn through check_map_access_adj().

Tests will fail for non-root environment due to prohibited pointer
arithmetic, in particular in check_alu_op(), we bail out on the
is_pointer_value() check on the dst_reg (which is false in root
case as we allow for pointer arithmetic via env->allow_ptr_leaks).

Similarly to PTR_TO_PACKET, one way to fix it is to restrict the
allowed operations on PTR_TO_MAP_VALUE{,_ADJ} registers to 64 bit
mode BPF_ADD. The test_verifier suite runs fine after the patch
and it also rejects mentioned test cases.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Daniel Borkmann 
Reviewed-by: Josef Bacik 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5e6202e..86dedde 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1925,6 +1925,7 @@ static int check_alu_op(struct bpf_verifier_env *env, 
struct bpf_insn *insn)
 * register as unknown.
 */
if (env->allow_ptr_leaks &&
+   BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
 dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
-- 
1.9.3

[PATCH net 2/3] bpf, verifier: fix rejection of unaligned access checks for map_value_adj

2017-03-30 Thread Daniel Borkmann

Currently, the verifier doesn't reject unaligned access for map_value_adj
register types. Commit 484611357c19 ("bpf: allow access into map value
arrays") added logic to check_ptr_alignment() extending it from PTR_TO_PACKET
to also PTR_TO_MAP_VALUE_ADJ, but for PTR_TO_MAP_VALUE_ADJ no enforcement
is in place, because reg->id for PTR_TO_MAP_VALUE_ADJ reg types is never
non-zero, meaning, we can cause BPF_H/_W/_DW-based unaligned access for
architectures not supporting efficient unaligned access, and thus worst
case could raise exceptions on some archs that are unable to correct the
unaligned access or perform a different memory access to the actual
requested one and such.

i) Unaligned load with !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
   on r0 (map_value_adj):

   0: (bf) r2 = r10
   1: (07) r2 += -8
   2: (7a) *(u64 *)(r2 +0) = 0
   3: (18) r1 = 0x42533a00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+11
R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
   7: (61) r1 = *(u32 *)(r0 +0)
   8: (35) if r1 >= 0xb goto pc+9
R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 
R1=inv,min_value=0,max_value=10 R10=fp
   9: (07) r0 += 3
  10: (79) r7 = *(u64 *)(r0 +0)
R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 
R1=inv,min_value=0,max_value=10 R10=fp
  11: (79) r7 = *(u64 *)(r0 +2)
R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 
R1=inv,min_value=0,max_value=10 R7=inv R10=fp
  [...]

ii) Unaligned store with !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
on r0 (map_value_adj):

   0: (bf) r2 = r10
   1: (07) r2 += -8
   2: (7a) *(u64 *)(r2 +0) = 0
   3: (18) r1 = 0x4df16a00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+19
R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
   7: (07) r0 += 3
   8: (7a) *(u64 *)(r0 +0) = 42
R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
   9: (7a) *(u64 *)(r0 +2) = 43
R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
  10: (7a) *(u64 *)(r0 -2) = 44
R0=map_value_adj(ks=8,vs=48,id=0),min_value=3,max_value=3 R10=fp
  [...]

For the PTR_TO_PACKET type, reg->id is initially zero when skb->data
was fetched, it later receives a reg->id from env->id_gen generator
once another register with UNKNOWN_VALUE type was added to it via
check_packet_ptr_add(). The purpose of this reg->id is twofold: i) it
is used in find_good_pkt_pointers() for setting the allowed access
range for regs with PTR_TO_PACKET of same id once verifier matched
on data/data_end tests, and ii) for check_ptr_alignment() to determine
that when not having efficient unaligned access and register with
UNKNOWN_VALUE was added to PTR_TO_PACKET, that we're only allowed
to access the content bytewise due to unknown unalignment. reg->id
was never intended for PTR_TO_MAP_VALUE{,_ADJ} types and thus is
always zero, the only marking is in PTR_TO_MAP_VALUE_OR_NULL that
was added after 484611357c19 via 57a09bf0a416 ("bpf: Detect identical
PTR_TO_MAP_VALUE_OR_NULL registers"). Above tests will fail for
non-root environment due to prohibited pointer arithmetic.

The fix splits register-type specific checks into their own helper
instead of keeping them combined, so we don't run into a similar
issue in future once we extend check_ptr_alignment() further and
forget to add reg->type checks for some of the checks.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Daniel Borkmann 
Reviewed-by: Josef Bacik 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c | 58 +--
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86dedde..a834068 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env 
*env, int regno)
}
 }
 
-static int check_ptr_alignment(struct bpf_verifier_env *env,
-  struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+  int off, int size)
 {
-   if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
-   if (off % size != 0) {
-   verbose("misaligned access off %d size %d\n",
-   off, size);
-   return -EACCES;
-   } else {
-   return 0;
-   }
-   }
-
-   if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-   /* misaligned access to packet is ok on x86,arm,arm64 */
-   return 0;
-
if (reg->id && size != 1) {
-   verbose("Unknown packet alignment. Only byte-sized access 
allowed\n");
+   verbose("Unknown alignment. Only byte-sized access allowed in 
packet access.\n");

[PATCH net 0/3] BPF fixes on map_value_adj reg types

2017-03-30 Thread Daniel Borkmann

This set adds two fixes for map_value_adj register type in the
verifier and user space tests along with them for the BPF self
test suite. For details, please see individual patches.

Thanks!

Daniel Borkmann (3):
  bpf, verifier: fix alu ops against map_value{,_adj} register types
  bpf, verifier: fix rejection of unaligned access checks for map_value_adj
  bpf: add various verifier test cases for self-tests

 kernel/bpf/verifier.c   |  59 +++---
 tools/include/linux/filter.h|  10 ++
 tools/testing/selftests/bpf/Makefile|   9 +-
 tools/testing/selftests/bpf/test_verifier.c | 270 +++-
 4 files changed, 322 insertions(+), 26 deletions(-)

-- 
1.9.3

Re: EINVAL when using connect() for udp sockets

2017-03-30 Thread Eric Dumazet

On Thu, 2017-03-30 at 16:36 -0700, Cong Wang wrote:
> On Tue, Mar 28, 2017 at 5:52 PM, Eric Dumazet  wrote:
> > On Tue, 2017-03-28 at 16:11 -0700, Eric Dumazet wrote:
> >
> >> Yes, this looks better.
> >>
> >> Although you probably need to change a bit later this part :
> >>
> >> if (!inet->inet_saddr)
> >>   inet->inet_saddr = fl4->saddr;  /* Update source address */
> >>
> >
> > I came up with the following tested patch for IPv4
> >
> > diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
> > index 
> > f915abff1350a86af8d5bb89725b751c061b0fb5..1454b6191e0d38ffae0ae260578858285bc5f77b
> >  100644
> > --- a/net/ipv4/datagram.c
> > +++ b/net/ipv4/datagram.c
> > @@ -40,7 +40,7 @@ int __ip4_datagram_connect(struct sock *sk, struct 
> > sockaddr *uaddr, int addr_len
> > sk_dst_reset(sk);
> >
> > oif = sk->sk_bound_dev_if;
> > -   saddr = inet->inet_saddr;
> > +   saddr = (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ? inet->inet_saddr 
> > : 0;
> > if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
> > if (!oif)
> > oif = inet->mc_index;
> > @@ -64,9 +64,8 @@ int __ip4_datagram_connect(struct sock *sk, struct 
> > sockaddr *uaddr, int addr_len
> > err = -EACCES;
> > goto out;
> > }
> > -   if (!inet->inet_saddr)
> > +   if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
> > inet->inet_saddr = fl4->saddr;  /* Update source address */
> > -   if (!inet->inet_rcv_saddr) {
> > inet->inet_rcv_saddr = fl4->saddr;
> > if (sk->sk_prot->rehash)
> > sk->sk_prot->rehash(sk);
> 
> Why do we need this here? If you mean bind() INADDR_ANY is bound,
> then it is totally a different problem?


Proper delivery of RX packets will need to find the socket, and this
needs the 2-tuple (source address, source port) info for UDP.

So after a connect(), we need to rehash

> 
> BTW, I am still not sure about what POSIX says about the connect()
> behavior, I can only find this [1]:
> 
> "
> If the initiating socket is not connection-mode, then connect() shall set the
> socket's peer address, and no connection is made. For SOCK_DGRAM
> sockets, the peer address identifies where all datagrams are sent on
> subsequent send() functions, and limits the remote sender for subsequent
> recv() functions.
> "
> 
> It doesn't say anything about source address. But the man page [2] says:
> 
> "
> When
>connect(2) is called on an unbound socket, the socket is
>automatically bound to a random free port or to a usable shared port
>with the local address set to INADDR_ANY.
> "
> 
> Seems the last part is inaccurate, kernel actually picks a source address
> from route instead of just using INADDR_ANY for connect(2).
> 
> So, for me, I think the following behaviors make sense for UDP:
> 
> 1) When a bind() is called before connect()'s, aka:
> 
> bind();
> connect(addr1); // should not change source addr

It depends. bind() can be only allocating the source port.

If bind(INADDR_ANY) was used, then we need to determine source addr at
connect() time.

Point of connect() is that future send() wont have to guess the 4-tuple
infos. But also that incoming packets will find this precise socket
thanks to a higher score.

And tools like "ss -aun" should display the 4-tuple after a successful
connect()

> connect(addr2); // should fail if the source addr can not reach peer addr
> 
> 2) No bind() before connect()'s, aka:
> 
> connect(addr1); // Free to bind a source addr
> connect(addr2); // Free to bind a new source addr and change peer addr

Exactly. My patch does this.

> 
> Thoughts?
> 
> 1. http://pubs.opengroup.org/onlinepubs/9699919799/functions/connect.html
> 2. http://man7.org/linux/man-pages/man7/ip.7.html

Re: EINVAL when using connect() for udp sockets

2017-03-30 Thread Cong Wang

On Tue, Mar 28, 2017 at 5:52 PM, Eric Dumazet  wrote:
> On Tue, 2017-03-28 at 16:11 -0700, Eric Dumazet wrote:
>
>> Yes, this looks better.
>>
>> Although you probably need to change a bit later this part :
>>
>> if (!inet->inet_saddr)
>>   inet->inet_saddr = fl4->saddr;  /* Update source address */
>>
>
> I came up with the following tested patch for IPv4
>
> diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
> index 
> f915abff1350a86af8d5bb89725b751c061b0fb5..1454b6191e0d38ffae0ae260578858285bc5f77b
>  100644
> --- a/net/ipv4/datagram.c
> +++ b/net/ipv4/datagram.c
> @@ -40,7 +40,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr 
> *uaddr, int addr_len
> sk_dst_reset(sk);
>
> oif = sk->sk_bound_dev_if;
> -   saddr = inet->inet_saddr;
> +   saddr = (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ? inet->inet_saddr : 
> 0;
> if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
> if (!oif)
> oif = inet->mc_index;
> @@ -64,9 +64,8 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr 
> *uaddr, int addr_len
> err = -EACCES;
> goto out;
> }
> -   if (!inet->inet_saddr)
> +   if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
> inet->inet_saddr = fl4->saddr;  /* Update source address */
> -   if (!inet->inet_rcv_saddr) {
> inet->inet_rcv_saddr = fl4->saddr;
> if (sk->sk_prot->rehash)
> sk->sk_prot->rehash(sk);

Why do we need this here? If you mean bind() INADDR_ANY is bound,
then it is totally a different problem?

BTW, I am still not sure about what POSIX says about the connect()
behavior, I can only find this [1]:

"
If the initiating socket is not connection-mode, then connect() shall set the
socket's peer address, and no connection is made. For SOCK_DGRAM
sockets, the peer address identifies where all datagrams are sent on
subsequent send() functions, and limits the remote sender for subsequent
recv() functions.
"

It doesn't say anything about source address. But the man page [2] says:

"
When
   connect(2) is called on an unbound socket, the socket is
   automatically bound to a random free port or to a usable shared port
   with the local address set to INADDR_ANY.
"

Seems the last part is inaccurate, kernel actually picks a source address
from route instead of just using INADDR_ANY for connect(2).

So, for me, I think the following behaviors make sense for UDP:

1) When a bind() is called before connect()'s, aka:

bind();
connect(addr1); // should not change source addr
connect(addr2); // should fail if the source addr can not reach peer addr

2) No bind() before connect()'s, aka:

connect(addr1); // Free to bind a source addr
connect(addr2); // Free to bind a new source addr and change peer addr

Thoughts?

1. http://pubs.opengroup.org/onlinepubs/9699919799/functions/connect.html
2. http://man7.org/linux/man-pages/man7/ip.7.html

Re: [PATCH 5/5] PCI: remove pci_enable_msix

2017-03-30 Thread Bjorn Helgaas

On Mon, Mar 27, 2017 at 10:29:36AM +0200, Christoph Hellwig wrote:
> Unused now that all callers switched to pci_alloc_irq_vectors.
> 
> Signed-off-by: Christoph Hellwig 

Acked-by: Bjorn Helgaas 

I assume this will be merged with the rest via the netdev tree.

> ---
>  drivers/pci/msi.c   | 21 -
>  include/linux/pci.h |  4 
>  2 files changed, 25 deletions(-)
> 
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index d571bc330686..0042c365b29b 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -973,27 +973,6 @@ static int __pci_enable_msix(struct pci_dev *dev, struct 
> msix_entry *entries,
>   return msix_capability_init(dev, entries, nvec, affd);
>  }
>  
> -/**
> - * pci_enable_msix - configure device's MSI-X capability structure
> - * @dev: pointer to the pci_dev data structure of MSI-X device function
> - * @entries: pointer to an array of MSI-X entries (optional)
> - * @nvec: number of MSI-X irqs requested for allocation by device driver
> - *
> - * Setup the MSI-X capability structure of device function with the number
> - * of requested irqs upon its software driver call to request for
> - * MSI-X mode enabled on its hardware device function. A return of zero
> - * indicates the successful configuration of MSI-X capability structure
> - * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
> - * Or a return of > 0 indicates that driver request is exceeding the number
> - * of irqs or MSI-X vectors available. Driver should use the returned value 
> to
> - * re-send its request.
> - **/
> -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int 
> nvec)
> -{
> - return __pci_enable_msix(dev, entries, nvec, NULL);
> -}
> -EXPORT_SYMBOL(pci_enable_msix);
> -
>  void pci_msix_shutdown(struct pci_dev *dev)
>  {
>   struct msi_desc *entry;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index eb3da1a04e6c..82dec36845e6 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1300,7 +1300,6 @@ int pci_msi_vec_count(struct pci_dev *dev);
>  void pci_msi_shutdown(struct pci_dev *dev);
>  void pci_disable_msi(struct pci_dev *dev);
>  int pci_msix_vec_count(struct pci_dev *dev);
> -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int 
> nvec);
>  void pci_msix_shutdown(struct pci_dev *dev);
>  void pci_disable_msix(struct pci_dev *dev);
>  void pci_restore_msi_state(struct pci_dev *dev);
> @@ -1330,9 +1329,6 @@ static inline int pci_msi_vec_count(struct pci_dev 
> *dev) { return -ENOSYS; }
>  static inline void pci_msi_shutdown(struct pci_dev *dev) { }
>  static inline void pci_disable_msi(struct pci_dev *dev) { }
>  static inline int pci_msix_vec_count(struct pci_dev *dev) { return -ENOSYS; }
> -static inline int pci_enable_msix(struct pci_dev *dev,
> -   struct msix_entry *entries, int nvec)
> -{ return -ENOSYS; }
>  static inline void pci_msix_shutdown(struct pci_dev *dev) { }
>  static inline void pci_disable_msix(struct pci_dev *dev) { }
>  static inline void pci_restore_msi_state(struct pci_dev *dev) { }
> -- 
> 2.11.0
>

Re: [PATCH 5/5] PCI: remove pci_enable_msix

2017-03-30 Thread David Daney


On 03/30/2017 03:56 PM, Bjorn Helgaas wrote:

On Tue, Mar 28, 2017 at 09:24:15AM -0700, David Daney wrote:

On 03/27/2017 11:41 PM, Christoph Hellwig wrote:

On Mon, Mar 27, 2017 at 10:30:46AM -0700, David Daney wrote:

Use pci_enable_msix_{exact,range} for now, as I told you before.



That still results in twice as many MSI-X being provisioned than are needed.


How so?  Except for the return value, a pci_enable_msix_exact call with the
same arguments as your previous pci_enable_msix will work exactly the
same.



Sorry, I think it was my misunderstanding.  I didn't realize that we
had essentially renamed the function, but left the functionality
mostly unchanged.


Does this mean you're OK with this patch?


Yes.  I have re-written my GPIO driver to use the newer functions, so I 
withdraw my objections to the patch.


Thanks,
David Daney



I know it may require some
work on out-of-tree drivers and so on, but if that work is possible
and you don't actually lose functionality, I'm OK with this patch.

Bjorn

Re: [PATCH net] be2net: Fix endian issue in logical link config command

2017-03-30 Thread David Miller

From: Suresh Reddy 
Date: Thu, 30 Mar 2017 00:58:32 -0400

> Use cpu_to_le32() for link_config variable in set_logical_link_config
> command as this variable is of type u32.
> 
> Signed-off-by: Suresh Reddy 

Applied.

Re: [PATCH net 0/7] ibmvnic: Cleanup resource handling

2017-03-30 Thread David Miller

From: Nathan Fontenot 
Date: Thu, 30 Mar 2017 02:48:49 -0400

> In order to better manage the resources of the ibmvnic driver, this set of
> patches creates a set of initialization and release routines for the
> drivers resources. Additionally, some patches do some re-naming of the
> affected routines so that there is a common naming scheme in the driver.

Series applied to net-next.

[PATCH] ath6kl: Add __printf verification to ath6kl_dbg

2017-03-30 Thread Joe Perches

Fix fallout too.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/ath/ath6kl/debug.h| 2 ++
 drivers/net/wireless/ath/ath6kl/htc_pipe.c | 2 +-
 drivers/net/wireless/ath/ath6kl/wmi.c  | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/debug.h 
b/drivers/net/wireless/ath/ath6kl/debug.h
index 0614393dd7ae..94297572914f 100644
--- a/drivers/net/wireless/ath/ath6kl/debug.h
+++ b/drivers/net/wireless/ath/ath6kl/debug.h
@@ -63,6 +63,7 @@ int ath6kl_read_tgt_stats(struct ath6kl *ar, struct 
ath6kl_vif *vif);
 
 #ifdef CONFIG_ATH6KL_DEBUG
 
+__printf(2, 3)
 void ath6kl_dbg(enum ATH6K_DEBUG_MASK mask, const char *fmt, ...);
 void ath6kl_dbg_dump(enum ATH6K_DEBUG_MASK mask,
 const char *msg, const char *prefix,
@@ -83,6 +84,7 @@ int ath6kl_debug_init_fs(struct ath6kl *ar);
 void ath6kl_debug_cleanup(struct ath6kl *ar);
 
 #else
+__printf(2, 3)
 static inline void ath6kl_dbg(enum ATH6K_DEBUG_MASK dbg_mask,
  const char *fmt, ...)
 {
diff --git a/drivers/net/wireless/ath/ath6kl/htc_pipe.c 
b/drivers/net/wireless/ath/ath6kl/htc_pipe.c
index ca1a18c86c0d..d127a08d60df 100644
--- a/drivers/net/wireless/ath/ath6kl/htc_pipe.c
+++ b/drivers/net/wireless/ath/ath6kl/htc_pipe.c
@@ -995,7 +995,7 @@ static int ath6kl_htc_pipe_rx_complete(struct ath6kl *ar, 
struct sk_buff *skb,
 
if (netlen < (payload_len + HTC_HDR_LENGTH)) {
ath6kl_dbg(ATH6KL_DBG_HTC,
-  "HTC Rx: insufficient length, got:%d expected =%u\n",
+  "HTC Rx: insufficient length, got:%d expected 
=%zu\n",
   netlen, payload_len + HTC_HDR_LENGTH);
status = -EINVAL;
goto free_skb;
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c 
b/drivers/net/wireless/ath/ath6kl/wmi.c
index 84a6d12c3f8a..a082de81ec4c 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -1596,7 +1596,7 @@ static int ath6kl_wmi_txe_notify_event_rx(struct wmi 
*wmi, u8 *datap, int len,
rate = le32_to_cpu(ev->rate);
pkts = le32_to_cpu(ev->pkts);
 
-   ath6kl_dbg(ATH6KL_DBG_WMI, "TXE notify event: peer %pM rate %d% pkts %d 
intvl %ds\n",
+   ath6kl_dbg(ATH6KL_DBG_WMI, "TXE notify event: peer %pM rate %d%% pkts 
%d intvl %ds\n",
   vif->bssid, rate, pkts, vif->txe_intvl);
 
cfg80211_cqm_txe_notify(vif->ndev, vif->bssid, pkts,
-- 
2.10.0.rc2.1.g053435c

Re: [PATCH 5/5] PCI: remove pci_enable_msix

2017-03-30 Thread Bjorn Helgaas

On Tue, Mar 28, 2017 at 09:24:15AM -0700, David Daney wrote:
> On 03/27/2017 11:41 PM, Christoph Hellwig wrote:
> >On Mon, Mar 27, 2017 at 10:30:46AM -0700, David Daney wrote:
> >>>Use pci_enable_msix_{exact,range} for now, as I told you before.
> >>>
> >>
> >>That still results in twice as many MSI-X being provisioned than are needed.
> >
> >How so?  Except for the return value, a pci_enable_msix_exact call with the
> >same arguments as your previous pci_enable_msix will work exactly the
> >same.
> >
> 
> Sorry, I think it was my misunderstanding.  I didn't realize that we
> had essentially renamed the function, but left the functionality
> mostly unchanged.

Does this mean you're OK with this patch?  I know it may require some
work on out-of-tree drivers and so on, but if that work is possible
and you don't actually lose functionality, I'm OK with this patch.

Bjorn

Re: [PATCH] net/faraday: Explicitly include linux/of.h and linux/property.h

2017-03-30 Thread Joel Stanley

On Fri, Mar 31, 2017 at 2:30 AM, Mark Brown  wrote:
> This driver uses interfaces from linux/of.h and linux/property.h but
> relies on implicit inclusion of those headers which means that changes in
> other headers could break the build, as happened in -next for arm today.
> Add explicit includes.
>
> Signed-off-by: Mark Brown 

Acked-by: Joel Stanley 

Thank you for fixing this Mark.

Cheers,

Joel

> ---
>  drivers/net/ethernet/faraday/ftgmac100.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/net/ethernet/faraday/ftgmac100.c 
> b/drivers/net/ethernet/faraday/ftgmac100.c
> index 928b0df2b8e0..ade6b3e4ed13 100644
> --- a/drivers/net/ethernet/faraday/ftgmac100.c
> +++ b/drivers/net/ethernet/faraday/ftgmac100.c
> @@ -28,8 +28,10 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/of.h>
>  #include 
>  #include 
> +#include <linux/property.h>
>  #include 
>  #include 
>
> --
> 2.11.0
>

Re: [PATCH nf-next 1/1] net: tcp: Refine the __tcp_select_window

2017-03-30 Thread David Miller

From: gfree.w...@foxmail.com
Date: Thu, 30 Mar 2017 06:49:19 +0800

> From: Gao Feng 
> 
> 1. Move the "window = tp->rcv_wnd;" into the condition block without
> tp->rx_opt.rcv_wscale.
> Because it is unnecessary when enable wscale;
> 
> 2. Use the macro ALIGN instead of two statements.
> The two statements are used to make window align to 1<<tp->rx_opt.rcv_wscale.
> Use the ALIGN is more clearer.
> 
> 3. Use the rounddown to make codes clearer.
> 
> Signed-off-by: Gao Feng 

Applied, but please do not target non-netfilter patches using
"nf-next" in your Subject lines.

Re: [PATCH RESEND net-next] net: dsa: mv88e6xxx: debug ATU Age Time

2017-03-30 Thread David Miller

From: Vivien Didelot 
Date: Wed, 29 Mar 2017 15:38:37 -0400

> The ATU ageing time value programmed in the switch is rounded up to the
> nearest multiple of its coefficient (variable depending on the model.)
> 
> Add a debug message to inform the user about the exact programmed value.
> 
> On 6352, "brctl setageing br0 18" gives "AgeTime set to 0x01 (15000 ms)"
> while on 6390 we get "AgeTime set to 0x05 (18750 ms)".
> 
> Signed-off-by: Vivien Didelot 

Applied.

Re: [next-queue v6 PATCH 7/7] i40e: Add support to get switch id and port number for port netdevs

2017-03-30 Thread Alexander Duyck

On Thu, Mar 30, 2017 at 2:45 PM, Jakub Kicinski
 wrote:
> On Wed, 29 Mar 2017 17:22:55 -0700, Sridhar Samudrala wrote:
>> Introduce switchdev_ops to PF and port netdevs to return the switch id via
>> SWITCHDEV_ATTR_ID_PORT_PARENT_ID attribute.
>> Also, ndo_get_phys_port_name() support is added to port netdevs to return
>> the port number.
>>
> ...
>> +static int
>> +i40e_port_netdev_get_phys_port_name(struct net_device *dev, char *buf,
>> + size_t len)
>> +{
>> + struct i40e_port_netdev_priv *priv = netdev_priv(dev);
>> + struct i40e_vf *vf;
>> + int ret;
>> +
>> + switch (priv->type) {
>> + case I40E_PORT_NETDEV_VF:
>> + vf = (struct i40e_vf *)priv->f;
>> + ret = snprintf(buf, len, "%d", vf->vf_id);
>> + break;
>> + case I40E_PORT_NETDEV_PF:
>> + ret = snprintf(buf, len, "%d", I40E_MAIN_VSI_PORT_ID);
>> + break;
>> + default:
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + if (ret >= len)
>> + return -EOPNOTSUPP;
>> +
>> + return 0;
>> +}
>
> You are using only an integer here, which forces you to manually name
> the netdev in patch 2, and that is what phys_port_name is supposed to
> help avoid doing AFAIU.
>
> We have naming rules in Documentation/networking/switchdev.txt for
> switch ports suggested as pX for physical ports or pXsY for ports which
> are broken out/split.  Could we establish similar suggestion for vf and
> pf representors and document it? (note: we may need pf representors for
> multi-host devices.)
>
> IMHO naming representors pfr%d or vfr%d would make sense.  This way
> actual VF and PF netdevs could be called pf%d and vf%d, and
> udev/systemd will give all netdevs nice, meaningful names without any
> custom rules.
>
> Sorry for the bike shedding but I was hoping we could save some user
> pain by establishing those rules (more or less) upfront.

This is something we should probably discuss at netdev/netconf next
week. It seems like the convention has been to just use an integer and
I think we might want to look at doing something like you are
suggesting where if nothing else we come up with a way of identifying
that a VF versus something like a segmented port which is the only
thing currently defined in the documentation.

- Alex

Re: [next-queue v6 PATCH 7/7] i40e: Add support to get switch id and port number for port netdevs

2017-03-30 Thread Jakub Kicinski

On Wed, 29 Mar 2017 17:22:55 -0700, Sridhar Samudrala wrote:
> Introduce switchdev_ops to PF and port netdevs to return the switch id via
> SWITCHDEV_ATTR_ID_PORT_PARENT_ID attribute.
> Also, ndo_get_phys_port_name() support is added to port netdevs to return
> the port number.
> 
...
> +static int
> +i40e_port_netdev_get_phys_port_name(struct net_device *dev, char *buf,
> + size_t len)
> +{
> + struct i40e_port_netdev_priv *priv = netdev_priv(dev);
> + struct i40e_vf *vf;
> + int ret;
> +
> + switch (priv->type) {
> + case I40E_PORT_NETDEV_VF:
> + vf = (struct i40e_vf *)priv->f;
> + ret = snprintf(buf, len, "%d", vf->vf_id);
> + break;
> + case I40E_PORT_NETDEV_PF:
> + ret = snprintf(buf, len, "%d", I40E_MAIN_VSI_PORT_ID);
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> +
> + if (ret >= len)
> + return -EOPNOTSUPP;
> +
> + return 0;
> +}

You are using only an integer here, which forces you to manually name
the netdev in patch 2, and that is what phys_port_name is supposed to
help avoid doing AFAIU.

We have naming rules in Documentation/networking/switchdev.txt for
switch ports suggested as pX for physical ports or pXsY for ports which
are broken out/split.  Could we establish similar suggestion for vf and
pf representors and document it? (note: we may need pf representors for
multi-host devices.)

IMHO naming representors pfr%d or vfr%d would make sense.  This way
actual VF and PF netdevs could be called pf%d and vf%d, and
udev/systemd will give all netdevs nice, meaningful names without any
custom rules.

Sorry for the bike shedding but I was hoping we could save some user
pain by establishing those rules (more or less) upfront.

[PATCH net-next v2 2/9] net: dsa: mv88e6xxx: use 4-bit port for PVT data

2017-03-30 Thread Vivien Didelot

The Cross-chip Port Based VLAN Table (PVT) supports two indexing modes,
one using 5-bit for device and 4-bit for port, the other using 4-bit for
device and 5-bit for port, configured via the Global 2 Misc register.

Only 4 bits for the source port are needed when interconnecting 88E6xxx
switch devices since they all support less than 16 physical ports. The
full 5 bits are needed when interconnecting a device with 98DXxxx switch
devices since they support more than 16 physical ports.

Add a mv88e6xxx_pvt_setup helper to set the 4-bit port PVT mode, which
will be extended later to also initialize the PVT content.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 15 +++
 drivers/net/dsa/mv88e6xxx/global2.c   | 25 +
 drivers/net/dsa/mv88e6xxx/global2.h   |  7 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  1 +
 4 files changed, 48 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 8f1f881d0375..2a32bb490f92 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1198,6 +1198,17 @@ static int mv88e6xxx_atu_setup(struct mv88e6xxx_chip 
*chip)
return mv88e6xxx_g1_atu_set_age_time(chip, 30);
 }
 
+static int mv88e6xxx_pvt_setup(struct mv88e6xxx_chip *chip)
+{
+   if (!mv88e6xxx_has_pvt(chip))
+   return 0;
+
+   /* Clear 5 Bit Port for usage with Marvell Link Street devices:
+* use 4 bits for the Src_Port/Src_Trunk and 5 bits for the Src_Dev.
+*/
+   return mv88e6xxx_g2_misc_4_bit_port(chip);
+}
+
 static void mv88e6xxx_port_fast_age(struct dsa_switch *ds, int port)
 {
struct mv88e6xxx_chip *chip = ds->priv;
@@ -2594,6 +2605,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
goto unlock;
}
 
+   err = mv88e6xxx_pvt_setup(chip);
+   if (err)
+   goto unlock;
+
err = mv88e6xxx_atu_setup(chip);
if (err)
goto unlock;
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c 
b/drivers/net/dsa/mv88e6xxx/global2.c
index 6228aab2ad35..d64f4c15ccb7 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -784,6 +784,31 @@ static int mv88e6xxx_g2_watchdog_setup(struct 
mv88e6xxx_chip *chip)
return err;
 }
 
+/* Offset 0x1D: Misc Register */
+
+static int mv88e6xxx_g2_misc_5_bit_port(struct mv88e6xxx_chip *chip,
+   bool port_5_bit)
+{
+   u16 val;
+   int err;
+
+   err = mv88e6xxx_g2_read(chip, GLOBAL2_MISC, );
+   if (err)
+   return err;
+
+   if (port_5_bit)
+   val |= GLOBAL2_MISC_5_BIT_PORT;
+   else
+   val &= ~GLOBAL2_MISC_5_BIT_PORT;
+
+   return mv88e6xxx_g2_write(chip, GLOBAL2_MISC, val);
+}
+
+int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip)
+{
+   return mv88e6xxx_g2_misc_5_bit_port(chip, false);
+}
+
 static void mv88e6xxx_g2_irq_mask(struct irq_data *d)
 {
struct mv88e6xxx_chip *chip = irq_data_get_irq_chip_data(d);
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h 
b/drivers/net/dsa/mv88e6xxx/global2.h
index f8b6dd93213a..71fb2ff541ba 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -42,6 +42,8 @@ int mv88e6xxx_g2_get_eeprom16(struct mv88e6xxx_chip *chip,
 int mv88e6xxx_g2_set_eeprom16(struct mv88e6xxx_chip *chip,
  struct ethtool_eeprom *eeprom, u8 *data);
 
+int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip);
+
 int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip);
 int mv88e6xxx_g2_irq_setup(struct mv88e6xxx_chip *chip);
 void mv88e6xxx_g2_irq_free(struct mv88e6xxx_chip *chip);
@@ -110,6 +112,11 @@ static inline int mv88e6xxx_g2_set_eeprom16(struct 
mv88e6xxx_chip *chip,
return -EOPNOTSUPP;
 }
 
+int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip)
+{
+   return -EOPNOTSUPP;
+}
+
 static inline int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
 {
return -EOPNOTSUPP;
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h 
b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index 97dd3e2d2a56..bcaa55b20f5a 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -439,6 +439,7 @@
 #define GLOBAL2_WDOG_FORCE_IRQ BIT(0)
 #define GLOBAL2_QOS_WEIGHT 0x1c
 #define GLOBAL2_MISC   0x1d
+#define GLOBAL2_MISC_5_BIT_PORTBIT(14)
 
 #define MV88E6XXX_N_FID4096
 
-- 
2.12.1

[PATCH net-next v2 8/9] net: dsa: add cross-chip bridging operations

2017-03-30 Thread Vivien Didelot

Introduce crosschip_bridge_{join,leave} operations in the dsa_switch_ops
structure, which can be used by switches supporting interconnection.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h |  8 
 net/dsa/switch.c  | 12 ++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 951b5e49e899..ffe56cc338fe 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -456,6 +456,14 @@ struct dsa_switch_ops {
   bool ingress);
void(*port_mirror_del)(struct dsa_switch *ds, int port,
   struct dsa_mall_mirror_tc_entry *mirror);
+
+   /*
+* Cross-chip operations
+*/
+   int (*crosschip_bridge_join)(struct dsa_switch *ds, int sw_index,
+int port, struct net_device *br);
+   void(*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index,
+ int port, struct net_device *br);
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 7b6f38e5fef6..ca6e26e514f0 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -20,9 +20,9 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds,
if (ds->index == info->sw_index && ds->ops->port_bridge_join)
return ds->ops->port_bridge_join(ds, info->port, info->br);
 
-   if (ds->index != info->sw_index)
-   dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n",
-   info->sw_index, info->port, netdev_name(info->br));
+   if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join)
+   return ds->ops->crosschip_bridge_join(ds, info->sw_index,
+ info->port, info->br);
 
return 0;
 }
@@ -33,9 +33,9 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
ds->ops->port_bridge_leave(ds, info->port, info->br);
 
-   if (ds->index != info->sw_index)
-   dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n",
-   info->sw_index, info->port, netdev_name(info->br));
+   if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave)
+   ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port,
+   info->br);
 
return 0;
 }
-- 
2.12.1

[PATCH net-next v2 6/9] net: dsa: mv88e6xxx: factorize in-chip bridge map

2017-03-30 Thread Vivien Didelot

Factorize the code in the DSA port_bridge_{join,leave} routines used to
program the port VLAN map of all local ports of a given bridge group.

At the same time shorten the _mv88e6xxx_port_based_vlan_map to get rid
of the old underscore prefix naming convention.

Signed-off-by: Vivien Didelot 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 47 +---
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9b2d369715d7..3802e1bdd111 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1156,7 +1156,7 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip 
*chip, int dev, int port)
return pvlan;
 }
 
-static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int 
port)
+static int mv88e6xxx_port_vlan_map(struct mv88e6xxx_chip *chip, int port)
 {
u16 output_ports = mv88e6xxx_port_vlan(chip, chip->ds->index, port);
 
@@ -2140,23 +2140,32 @@ static int mv88e6xxx_port_fdb_dump(struct dsa_switch 
*ds, int port,
return err;
 }
 
+static int mv88e6xxx_bridge_map(struct mv88e6xxx_chip *chip,
+   struct net_device *br)
+{
+   int port;
+   int err;
+
+   /* Remap the Port VLAN of each local bridge group member */
+   for (port = 0; port < mv88e6xxx_num_ports(chip); ++port) {
+   if (chip->ds->ports[port].bridge_dev == br) {
+   err = mv88e6xxx_port_vlan_map(chip, port);
+   if (err)
+   return err;
+   }
+   }
+
+   return 0;
+}
+
 static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
  struct net_device *br)
 {
struct mv88e6xxx_chip *chip = ds->priv;
-   int i, err = 0;
+   int err;
 
mutex_lock(>reg_lock);
-
-   /* Remap each port's VLANTable */
-   for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
-   if (ds->ports[i].bridge_dev == br) {
-   err = _mv88e6xxx_port_based_vlan_map(chip, i);
-   if (err)
-   break;
-   }
-   }
-
+   err = mv88e6xxx_bridge_map(chip, br);
mutex_unlock(>reg_lock);
 
return err;
@@ -2166,17 +2175,11 @@ static void mv88e6xxx_port_bridge_leave(struct 
dsa_switch *ds, int port,
struct net_device *br)
 {
struct mv88e6xxx_chip *chip = ds->priv;
-   int i;
 
mutex_lock(>reg_lock);
-
-   /* Remap each port's VLANTable */
-   for (i = 0; i < mv88e6xxx_num_ports(chip); ++i)
-   if (i == port || ds->ports[i].bridge_dev == br)
-   if (_mv88e6xxx_port_based_vlan_map(chip, i))
-   netdev_warn(ds->ports[i].netdev,
-   "failed to remap\n");
-
+   if (mv88e6xxx_bridge_map(chip, br) ||
+   mv88e6xxx_port_vlan_map(chip, port))
+   dev_err(ds->dev, "failed to remap in-chip Port VLAN\n");
mutex_unlock(>reg_lock);
 }
 
@@ -2490,7 +2493,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
if (err)
return err;
 
-   err = _mv88e6xxx_port_based_vlan_map(chip, port);
+   err = mv88e6xxx_port_vlan_map(chip, port);
if (err)
return err;
 
-- 
2.12.1

Re: [PATCH net-next] rtnl: Add support for netdev event to link messages

2017-03-30 Thread David Ahern

On 3/30/17 9:21 AM, Vladislav Yasevich wrote:
> 
> So, something like the patch below would be better in your opinion as a
> starting point.  It can at least get the discussion started on whether
> an event would be useful to user space or not.

IMO that is a more direct, explicit statement of what is intended to happen.

> 
> However, that's really a separate point from what I was originally trying to do.
> I would like to provide the event type itself to the user, so the user may
> perform some action based on that event.


yes, we took a tangent. I was asking about why the PRECHANGEMTU was
causing a message to be sent to userspace.

[PATCH net-next v2 3/9] net: dsa: mv88e6xxx: program the PVT with all ones

2017-03-30 Thread Vivien Didelot

The Cross-chip Port Based VLAN Table (PVT) is currently initialized with
all ones, allowing any external ports to egress frames on local ports.

This commit implements the PVT access functions and programs the PVT
with all ones for the local switch ports only, instead of using the Init
operation. The current behavior is unchanged for the moment.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 31 -
 drivers/net/dsa/mv88e6xxx/global2.c   | 52 +--
 drivers/net/dsa/mv88e6xxx/global2.h   |  8 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  4 +++
 4 files changed, 86 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 2a32bb490f92..fb6a723c2137 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1198,15 +1198,44 @@ static int mv88e6xxx_atu_setup(struct mv88e6xxx_chip 
*chip)
return mv88e6xxx_g1_atu_set_age_time(chip, 30);
 }
 
+static int mv88e6xxx_pvt_map(struct mv88e6xxx_chip *chip, int dev, int port)
+{
+   u16 pvlan = 0;
+
+   if (!mv88e6xxx_has_pvt(chip))
+   return -EOPNOTSUPP;
+
+   /* Skip the local source device, which uses in-chip port VLAN */
+   if (dev != chip->ds->index)
+   pvlan = mv88e6xxx_port_mask(chip);
+
+   return mv88e6xxx_g2_pvt_write(chip, dev, port, pvlan);
+}
+
 static int mv88e6xxx_pvt_setup(struct mv88e6xxx_chip *chip)
 {
+   int dev, port;
+   int err;
+
if (!mv88e6xxx_has_pvt(chip))
return 0;
 
/* Clear 5 Bit Port for usage with Marvell Link Street devices:
 * use 4 bits for the Src_Port/Src_Trunk and 5 bits for the Src_Dev.
 */
-   return mv88e6xxx_g2_misc_4_bit_port(chip);
+   err = mv88e6xxx_g2_misc_4_bit_port(chip);
+   if (err)
+   return err;
+
+   for (dev = 0; dev < MV88E6XXX_MAX_PVT_SWITCHES; ++dev) {
+   for (port = 0; port < MV88E6XXX_MAX_PVT_PORTS; ++port) {
+   err = mv88e6xxx_pvt_map(chip, dev, port);
+   if (err)
+   return err;
+   }
+   }
+
+   return 0;
 }
 
 static void mv88e6xxx_port_fast_age(struct dsa_switch *ds, int port)
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c 
b/drivers/net/dsa/mv88e6xxx/global2.c
index d64f4c15ccb7..7c6bc33a9516 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -172,6 +172,50 @@ static int mv88e6xxx_g2_clear_irl(struct mv88e6xxx_chip 
*chip)
return err;
 }
 
+/* Offset 0x0B: Cross-chip Port VLAN (Addr) Register
+ * Offset 0x0C: Cross-chip Port VLAN Data Register
+ */
+
+static int mv88e6xxx_g2_pvt_op_wait(struct mv88e6xxx_chip *chip)
+{
+   return mv88e6xxx_g2_wait(chip, GLOBAL2_PVT_ADDR, GLOBAL2_PVT_ADDR_BUSY);
+}
+
+static int mv88e6xxx_g2_pvt_op(struct mv88e6xxx_chip *chip, int src_dev,
+  int src_port, u16 op)
+{
+   int err;
+
+   /* 9-bit Cross-chip PVT pointer: with GLOBAL2_MISC_5_BIT_PORT cleared,
+* source device is 5-bit, source port is 4-bit.
+*/
+   op |= (src_dev & 0x1f) << 4;
+   op |= (src_port & 0xf);
+
+   err = mv88e6xxx_g2_write(chip, GLOBAL2_PVT_ADDR, op);
+   if (err)
+   return err;
+
+   return mv88e6xxx_g2_pvt_op_wait(chip);
+}
+
+int mv88e6xxx_g2_pvt_write(struct mv88e6xxx_chip *chip, int src_dev,
+  int src_port, u16 data)
+{
+   int err;
+
+   err = mv88e6xxx_g2_pvt_op_wait(chip);
+   if (err)
+   return err;
+
+   err = mv88e6xxx_g2_write(chip, GLOBAL2_PVT_DATA, data);
+   if (err)
+   return err;
+
+   return mv88e6xxx_g2_pvt_op(chip, src_dev, src_port,
+  GLOBAL2_PVT_ADDR_OP_WRITE_PVLAN);
+}
+
 /* Offset 0x0D: Switch MAC/WoL/WoF register */
 
 static int mv88e6xxx_g2_switch_mac_write(struct mv88e6xxx_chip *chip,
@@ -991,14 +1035,6 @@ int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
return err;
}
 
-   if (mv88e6xxx_has_pvt(chip)) {
-   /* Initialize Cross-chip Port VLAN Table to reset defaults */
-   err = mv88e6xxx_g2_write(chip, GLOBAL2_PVT_ADDR,
-GLOBAL2_PVT_ADDR_OP_INIT_ONES);
-   if (err)
-   return err;
-   }
-
if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_G2_POT)) {
/* Clear the priority override table. */
err = mv88e6xxx_g2_clear_pot(chip);
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h 
b/drivers/net/dsa/mv88e6xxx/global2.h
index 71fb2ff541ba..96046bb12ca1 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -42,6 +42,8 @@ int mv88e6xxx_g2_get_eeprom16(struct mv88e6xxx_chip *chip,

[PATCH net-next v2 9/9] net: dsa: mv88e6xxx: add cross-chip bridging

2017-03-30 Thread Vivien Didelot

Implement the DSA cross-chip bridging operations by remapping the local
ports an external source port can egress frames to, when this cross-chip
port joins or leaves a bridge.

The PVT is no longer configured with all ones allowing any external
frame to egress any local port. Only DSA and CPU ports, as well as
bridge group members, can egress frames on local ports.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 34 +-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c6f45a2a9335..44ba8cff5631 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1222,7 +1222,7 @@ static int mv88e6xxx_pvt_map(struct mv88e6xxx_chip *chip, 
int dev, int port)
 
/* Skip the local source device, which uses in-chip port VLAN */
if (dev != chip->ds->index)
-   pvlan = mv88e6xxx_port_mask(chip);
+   pvlan = mv88e6xxx_port_vlan(chip, dev, port);
 
return mv88e6xxx_g2_pvt_write(chip, dev, port, pvlan);
 }
@@ -2203,6 +2203,36 @@ static void mv88e6xxx_port_bridge_leave(struct 
dsa_switch *ds, int port,
mutex_unlock(>reg_lock);
 }
 
+static int mv88e6xxx_crosschip_bridge_join(struct dsa_switch *ds, int dev,
+  int port, struct net_device *br)
+{
+   struct mv88e6xxx_chip *chip = ds->priv;
+   int err;
+
+   if (!mv88e6xxx_has_pvt(chip))
+   return 0;
+
+   mutex_lock(>reg_lock);
+   err = mv88e6xxx_pvt_map(chip, dev, port);
+   mutex_unlock(>reg_lock);
+
+   return err;
+}
+
+static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, int dev,
+int port, struct net_device *br)
+{
+   struct mv88e6xxx_chip *chip = ds->priv;
+
+   if (!mv88e6xxx_has_pvt(chip))
+   return;
+
+   mutex_lock(>reg_lock);
+   if (mv88e6xxx_pvt_map(chip, dev, port))
+   dev_err(ds->dev, "failed to remap cross-chip Port VLAN\n");
+   mutex_unlock(>reg_lock);
+}
+
 static int mv88e6xxx_software_reset(struct mv88e6xxx_chip *chip)
 {
if (chip->info->ops->reset)
@@ -4313,6 +4343,8 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = 
{
.port_mdb_add   = mv88e6xxx_port_mdb_add,
.port_mdb_del   = mv88e6xxx_port_mdb_del,
.port_mdb_dump  = mv88e6xxx_port_mdb_dump,
+   .crosschip_bridge_join  = mv88e6xxx_crosschip_bridge_join,
+   .crosschip_bridge_leave = mv88e6xxx_crosschip_bridge_leave,
 };
 
 static struct dsa_switch_driver mv88e6xxx_switch_drv = {
-- 
2.12.1

[PATCH net-next v2 7/9] net: dsa: mv88e6xxx: remap existing bridge members

2017-03-30 Thread Vivien Didelot

When a local port of a switch chip becomes a member of a bridge group,
we need to reprogram the Cross-chip Port Based VLAN Table (PVT) to allow
existing cross-chip bridge members to egress frames on the new ports.

There are no functional changes yet, since the PVT is still programmed
with all ones, allowing any external port to egress frames locally.

Signed-off-by: Vivien Didelot 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3802e1bdd111..c6f45a2a9335 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2143,7 +2143,9 @@ static int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, 
int port,
 static int mv88e6xxx_bridge_map(struct mv88e6xxx_chip *chip,
struct net_device *br)
 {
+   struct dsa_switch *ds;
int port;
+   int dev;
int err;
 
/* Remap the Port VLAN of each local bridge group member */
@@ -2155,6 +2157,24 @@ static int mv88e6xxx_bridge_map(struct mv88e6xxx_chip 
*chip,
}
}
 
+   if (!mv88e6xxx_has_pvt(chip))
+   return 0;
+
+   /* Remap the Port VLAN of each cross-chip bridge group member */
+   for (dev = 0; dev < DSA_MAX_SWITCHES; ++dev) {
+   ds = chip->ds->dst->ds[dev];
+   if (!ds)
+   break;
+
+   for (port = 0; port < ds->num_ports; ++port) {
+   if (ds->ports[port].bridge_dev == br) {
+   err = mv88e6xxx_pvt_map(chip, dev, port);
+   if (err)
+   return err;
+   }
+   }
+   }
+
return 0;
 }
 
-- 
2.12.1

[PATCH net-next v2 4/9] net: dsa: mv88e6xxx: allocate the number of ports

2017-03-30 Thread Vivien Didelot

The current code allocates DSA_MAX_PORTS ports for a Marvell dsa_switch
structure. Provide the exact number of ports so the corresponding
ds->num_ports is accurate.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index fb6a723c2137..28bdfadbf050 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4286,7 +4286,7 @@ static int mv88e6xxx_register_switch(struct 
mv88e6xxx_chip *chip)
struct device *dev = chip->dev;
struct dsa_switch *ds;
 
-   ds = dsa_switch_alloc(dev, DSA_MAX_PORTS);
+   ds = dsa_switch_alloc(dev, mv88e6xxx_num_ports(chip));
if (!ds)
return -ENOMEM;
 
-- 
2.12.1

[PATCH net-next v2 0/9] net: dsa: mv88e6xxx: program cross-chip bridging

2017-03-30 Thread Vivien Didelot

The purpose of this patch series is to bring hardware cross-chip
bridging configuration to the DSA layer and the mv88e6xxx DSA driver.

Most recent Marvell switch chips have a Cross-chip Port Based VLAN Table
(PVT) used to restrict to which internal destination port an arbitrary
external source port is allowed to egress frames to.

The current behavior of the mv88e6xxx driver is to program this table
with all ones, allowing any external ports to egress frames on any
internal ports. This means that carefully crafted Ethernet frames can
potentially bypass the user bridging configuration.

Patches 1 to 7 prepare the setup of this table and factorize the common
bits of both in-chip and cross-chip Marvell bridging code.

Patch 8 adds new optional cross-chip bridging operations to DSA switch.

Patch 9 switches the current behavior to program the table according to
the user bridging configuration when (cross-chip) ports get (un)bridged.

On a ZII Rev B board, bridging together the 3 user ports of both 88E6352
will result in the following PVTs on respectively switch 0 and switch 1:

External   Internal Ports
Dev Port   0  1  2  3  4  5  6

 10*  *  *  -  -  *  *
 11*  *  *  -  -  *  *
 12*  *  *  -  -  *  *
 13-  -  -  -  -  *  *
 14-  -  -  -  -  *  *
 15*  *  *  *  *  *  *
 16*  *  *  *  *  *  *

 00*  *  *  -  -  *  *
 01*  *  *  -  -  *  *
 02*  *  *  -  -  *  *
 03-  -  -  -  -  *  *
 04-  -  -  -  -  *  *
 05*  *  *  *  *  *  *
 06*  *  *  *  *  *  *

Changes since v1:
  - Define MV88E6XXX_MAX_PVT_SWITCHES and MV88E6XXX_MAX_PVT_PORTS
  - use mv88e6xxx_g2_misc_4_bit_port instead of the 5-bit variant
  - add Andrew's tags and reword commit 6/9


Vivien Didelot (9):
  net: dsa: mv88e6xxx: move PVT description in info
  net: dsa: mv88e6xxx: use 4-bit port for PVT data
  net: dsa: mv88e6xxx: program the PVT with all ones
  net: dsa: mv88e6xxx: allocate the number of ports
  net: dsa: mv88e6xxx: rework in-chip bridging
  net: dsa: mv88e6xxx: factorize in-chip bridge map
  net: dsa: mv88e6xxx: remap existing bridge members
  net: dsa: add cross-chip bridging operations
  net: dsa: mv88e6xxx: add cross-chip bridging

 drivers/net/dsa/mv88e6xxx/chip.c  | 212 --
 drivers/net/dsa/mv88e6xxx/global2.c   |  77 ++--
 drivers/net/dsa/mv88e6xxx/global2.h   |  15 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  37 +++---
 include/net/dsa.h |   8 ++
 net/dsa/switch.c  |  12 +-
 6 files changed, 288 insertions(+), 73 deletions(-)

-- 
2.12.1

[PATCH net-next v2 1/9] net: dsa: mv88e6xxx: move PVT description in info

2017-03-30 Thread Vivien Didelot

Not all Marvell switch chips feature a Cross-chip Port VLAN Table (PVT).

Chips with a PVT use the same implementation, so a new mv88e6xxx_ops
member won't be necessary yet. Add a "pvt" boolean member to the
mv88e6xxx_info structure and kill the obsolete MV88E6XXX_FLAGS_PVT flag.

Add a mv88e6xxx_has_pvt helper to wrap future checks of that condition.

Signed-off-by: Vivien Didelot 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 22 ++
 drivers/net/dsa/mv88e6xxx/global2.c   |  2 +-
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 32 +++-
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3c946af1159d..8f1f881d0375 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3578,6 +3578,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 8,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6097,
.ops = _ops,
@@ -3610,6 +3611,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 8,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6097,
.ops = _ops,
@@ -3626,6 +3628,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = _ops,
@@ -3657,6 +3660,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.global1_addr = 0x1b,
.age_time_coeff = 3750,
.atu_move_port_mask = 0x1f,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6341,
.ops = _ops,
@@ -3673,6 +3677,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = _ops,
@@ -3689,6 +3694,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_DSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6165,
.ops = _ops,
@@ -3705,6 +3711,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6351,
.ops = _ops,
@@ -3721,6 +3728,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6352,
.ops = _ops,
@@ -3737,6 +3745,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6351,
.ops = _ops,
@@ -3753,6 +3762,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 15000,
.g1_irqs = 9,
.atu_move_port_mask = 0xf,
+   .pvt = true,
.tag_protocol = DSA_TAG_PROTO_EDSA,
.flags = MV88E6XXX_FLAGS_FAMILY_6352,
.ops = _ops,
@@ -3785,6 +3795,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.tag_protocol = DSA_TAG_PROTO_DSA,
.age_time_coeff = 3750,
.g1_irqs = 9,
+   .pvt = true,
.atu_move_port_mask = 0x1f,
.flags = MV88E6XXX_FLAGS_FAMILY_6390,
.ops = _ops,
@@ -3801,6 +3812,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
.age_time_coeff = 3750,
.g1_irqs = 9,

[PATCH net-next v2 5/9] net: dsa: mv88e6xxx: rework in-chip bridging

2017-03-30 Thread Vivien Didelot

All ports -- internal and external, for chips featuring a PVT -- have a
mask restricting to which internal ports a frame is allowed to egress.

Now that DSA exposes the number of ports and their bridge devices, it is
possible to extract the code generating the VLAN map and make it generic
so that it can be shared later with the cross-chip bridging code.

Signed-off-by: Vivien Didelot 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 53 ++--
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 28bdfadbf050..9b2d369715d7 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1123,27 +1123,42 @@ static int mv88e6xxx_set_eee(struct dsa_switch *ds, int 
port,
return err;
 }
 
+static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port)
+{
+   struct dsa_switch *ds = NULL;
+   struct net_device *br;
+   u16 pvlan;
+   int i;
+
+   if (dev < DSA_MAX_SWITCHES)
+   ds = chip->ds->dst->ds[dev];
+
+   /* Prevent frames from unknown switch or port */
+   if (!ds || port >= ds->num_ports)
+   return 0;
+
+   /* Frames from DSA links and CPU ports can egress any local port */
+   if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+   return mv88e6xxx_port_mask(chip);
+
+   br = ds->ports[port].bridge_dev;
+   pvlan = 0;
+
+   /* Frames from user ports can egress any local DSA links and CPU ports,
+* as well as any local member of their bridge group.
+*/
+   for (i = 0; i < mv88e6xxx_num_ports(chip); ++i)
+   if (dsa_is_cpu_port(chip->ds, i) ||
+   dsa_is_dsa_port(chip->ds, i) ||
+   (br && chip->ds->ports[i].bridge_dev == br))
+   pvlan |= BIT(i);
+
+   return pvlan;
+}
+
 static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int 
port)
 {
-   struct dsa_switch *ds = chip->ds;
-   struct net_device *bridge = ds->ports[port].bridge_dev;
-   u16 output_ports = 0;
-   int i;
-
-   /* allow CPU port or DSA link(s) to send frames to every port */
-   if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
-   output_ports = ~0;
-   } else {
-   for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
-   /* allow sending frames to every group member */
-   if (bridge && ds->ports[i].bridge_dev == bridge)
-   output_ports |= BIT(i);
-
-   /* allow sending frames to CPU port and DSA link(s) */
-   if (dsa_is_cpu_port(ds, i) || dsa_is_dsa_port(ds, i))
-   output_ports |= BIT(i);
-   }
-   }
+   u16 output_ports = mv88e6xxx_port_vlan(chip, chip->ds->index, port);
 
/* prevent frames from going back out of the port they came in on */
output_ports &= ~BIT(port);
-- 
2.12.1

[PATCH] net: batman-adv: use new api ethtool_{get|set}_link_ksettings

2017-03-30 Thread Philippe Reynes

The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

I've only compile-tested this change. If someone could test it,
that would be very nice.

Signed-off-by: Philippe Reynes 
---
 net/batman-adv/soft-interface.c |   25 -
 1 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d042c99..07f6627 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -64,7 +64,8 @@
 #include "sysfs.h"
 #include "translation-table.h"
 
-static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd 
*cmd);
+static int batadv_get_link_ksettings(struct net_device *dev,
+struct ethtool_link_ksettings *cmd);
 static void batadv_get_drvinfo(struct net_device *dev,
   struct ethtool_drvinfo *info);
 static u32 batadv_get_msglevel(struct net_device *dev);
@@ -76,7 +77,6 @@ static void batadv_get_ethtool_stats(struct net_device *dev,
 static int batadv_get_sset_count(struct net_device *dev, int stringset);
 
 static const struct ethtool_ops batadv_ethtool_ops = {
-   .get_settings = batadv_get_settings,
.get_drvinfo = batadv_get_drvinfo,
.get_msglevel = batadv_get_msglevel,
.set_msglevel = batadv_set_msglevel,
@@ -84,6 +84,7 @@ static void batadv_get_ethtool_stats(struct net_device *dev,
.get_strings = batadv_get_strings,
.get_ethtool_stats = batadv_get_ethtool_stats,
.get_sset_count = batadv_get_sset_count,
+   .get_link_ksettings = batadv_get_link_ksettings,
 };
 
 int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
@@ -1085,18 +1086,16 @@ struct rtnl_link_ops batadv_link_ops __read_mostly = {
 };
 
 /* ethtool */
-static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int batadv_get_link_ksettings(struct net_device *dev,
+struct ethtool_link_ksettings *cmd)
 {
-   cmd->supported = 0;
-   cmd->advertising = 0;
-   ethtool_cmd_speed_set(cmd, SPEED_10);
-   cmd->duplex = DUPLEX_FULL;
-   cmd->port = PORT_TP;
-   cmd->phy_address = 0;
-   cmd->transceiver = XCVR_INTERNAL;
-   cmd->autoneg = AUTONEG_DISABLE;
-   cmd->maxtxpkt = 0;
-   cmd->maxrxpkt = 0;
+   ethtool_link_ksettings_zero_link_mode(cmd, supported);
+   ethtool_link_ksettings_zero_link_mode(cmd, advertising);
+   cmd->base.speed = SPEED_10;
+   cmd->base.duplex = DUPLEX_FULL;
+   cmd->base.port = PORT_TP;
+   cmd->base.phy_address = 0;
+   cmd->base.autoneg = AUTONEG_DISABLE;
 
return 0;
 }
-- 
1.7.4.4

Re: [PATCH] r8152: The Microsoft Surface docks also use R8152.

2017-03-30 Thread Rene Rebe

> On Thu, 2017-03-30 at 19:47 +0200, René Rebe wrote:
> > Hi,
> > 
> > On Mar 30, 2017, at 19:06, Dan Williams  wrote:
> > 
> > > On Tue, 2017-03-28 at 06:42 +0200, Rene Rebe wrote:
> > > > Without this the generic cdc_ether grabs the device,
> > > > and does not really work.
> > > 
> > > Does this need a corresponding blacklist in cdc_ether
> > > then?  Otherwise
> > > you're really depending on driver loading order.
> > 
> > right, I already test build, and will send after reboot.
> > 
> > Any other nitpick in the meantime?
> 
> Nope, other than it's pretty odd that the device would expose standard
> cdc-ether compatible USB descriptors, but apparently not be compatible
> with cdc-ether?  Are we sure we don't just need a tweak or something to
> cdc-ether?

Based on the existing glue and blacklist code I would assume most if
not all R8152 devices have this "issue".

However, sometimes the standard macOS driver works with it. IIRC if
the dock is connected after booting or so, so maybe the BIOS network
boot glue alters the behavior or whatever. It never was able to TX any
packet with the generic Linux cdc-ether, though.

Anyways, given the existing specific Linux driver I would assume it is
better to use it in any case.

   René

-- 
  René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
  http://exactcode.com | http://t2-project.org | http://rene.rebe.de

[PATCH] r8152: The Microsoft Surface docks also use R8152 v2

2017-03-30 Thread Rene Rebe

Without this the generic cdc_ether grabs the device,
and does not really work.

Signed-off-by: René Rebe 

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index f5552aa..f3ae88f 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -532,6 +532,7 @@ static const struct driver_info wwan_info = {
 #define LENOVO_VENDOR_ID   0x17ef
 #define NVIDIA_VENDOR_ID   0x0955
 #define HP_VENDOR_ID   0x03f0
+#define MICROSOFT_VENDOR_ID   0x045e
 
 static const struct usb_device_id  products[] = {
 /* BLACKLIST !!
@@ -761,6 +762,20 @@ static const struct usb_device_id  products[] = {
.driver_info = 0,
 },
 
+/* Microsoft Surface 2 dock (based on Realtek RTL8152) */
+{
+   USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x07ab, 
USB_CLASS_COMM,
+   USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+   .driver_info = 0,
+},
+
+/* Microsoft Surface 3 dock (based on Realtek RTL8153) */
+{
+   USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x07c6, 
USB_CLASS_COMM,
+   USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+   .driver_info = 0,
+},
+
 /* WHITELIST!!!
  *
  * CDC Ether uses two interfaces, not necessarily consecutive.
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index c34df33..07f788c 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -517,6 +517,7 @@ enum rtl8152_flags {
 
 /* Define these values to match your device */
 #define VENDOR_ID_REALTEK  0x0bda
+#define VENDOR_ID_MICROSOFT   0x045e
 #define VENDOR_ID_SAMSUNG  0x04e8
 #define VENDOR_ID_LENOVO   0x17ef
 #define VENDOR_ID_NVIDIA   0x0955
@@ -4521,6 +4522,8 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 static struct usb_device_id rtl8152_table[] = {
{REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
{REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
+   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
+   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
{REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},

-- 
  René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
  http://exactcode.com | http://t2-project.org | http://rene.rebe.de

Re: [PATCH net v2] openvswitch: Fix ovs_flow_key_update()

2017-03-30 Thread Jiri Benc

On Thu, 30 Mar 2017 12:36:03 -0700, Yi-Hung Wei wrote:
> ovs_flow_key_update() is called when the flow key is invalid, and it is
> used to update and revalidate the flow key. Commit 329f45bc4f19
> ("openvswitch: add mac_proto field to the flow key") introduces mac_proto
> field to flow key and use it to determine whether the flow key is valid.
> However, the commit does not update the code path in ovs_flow_key_update()
> to revalidate the flow key which may cause BUG_ON() on execute_recirc().
> This patch addresses the aforementioned issue.
> 
> Fixes: 329f45bc4f19 ("openvswitch: add mac_proto field to the flow key")
> Signed-off-by: Yi-Hung Wei 

Acked-by: Jiri Benc 

Thanks!

 Jiri

Re: [PATCH v3 1/2] net: phy: Fix PHY AN done state machine for interrupt driven PHYs

2017-03-30 Thread Florian Fainelli

On 03/27/2017 04:59 AM, Roger Quadros wrote:
> The Ethernet link on an interrupt driven PHY was not coming up if the
> Ethernet cable was plugged before the Ethernet interface was brought up.
> 
> The PHY state machine seems to be stuck from RUNNING to AN state
> with no new interrupts from the PHY. So it doesn't know when the
> PHY Auto-negotiation has been completed and doesn't transition to RUNNING
> state with ANEG done thus netif_carrier_on() is never called.
> 
> NOTE: genphy_config_aneg() will not restart PHY Auto-negotiation if
> advertisement parameters didn't change.
> 
> Fix this by scheduling the PHY state machine in phy_start_aneg().
> There is no way of knowing in phy.c whether auto-negotiation was
> restarted or not by the PHY driver so we just wait for the next
> poll/interrupt to update the PHY state machine.
> 
> Fixes: 3c293f4e08b5 ("net: phy: Trigger state machine on state change and not polling.")
> Cc: stable  # v4.9+
> Signed-off-by: Roger Quadros 

Reviewed-by: Florian Fainelli 
-- 
Florian

[PATCH net v2] openvswitch: Fix ovs_flow_key_update()

2017-03-30 Thread Yi-Hung Wei

ovs_flow_key_update() is called when the flow key is invalid, and it is
used to update and revalidate the flow key. Commit 329f45bc4f19
("openvswitch: add mac_proto field to the flow key") introduces mac_proto
field to flow key and use it to determine whether the flow key is valid.
However, the commit does not update the code path in ovs_flow_key_update()
to revalidate the flow key which may cause BUG_ON() on execute_recirc().
This patch addresses the aforementioned issue.

Fixes: 329f45bc4f19 ("openvswitch: add mac_proto field to the flow key")
Signed-off-by: Yi-Hung Wei 
---
 net/openvswitch/flow.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 9d4bb8eb63f2..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -527,7 +527,7 @@ static int key_extract(struct sk_buff *skb, struct 
sw_flow_key *key)
 
/* Link layer. */
clear_vlan(key);
-   if (key->mac_proto == MAC_PROTO_NONE) {
+   if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
if (unlikely(eth_type_vlan(skb->protocol)))
return -EINVAL;
 
@@ -745,7 +745,13 @@ static int key_extract(struct sk_buff *skb, struct 
sw_flow_key *key)
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
 {
-   return key_extract(skb, key);
+   int res;
+
+   res = key_extract(skb, key);
+   if (!res)
+   key->mac_proto &= ~SW_FLOW_KEY_INVALID;
+
+   return res;
 }
 
 static int key_extract_mac_proto(struct sk_buff *skb)
-- 
2.7.4

Re: [PATCH net] ibmvnic: Remove debugfs support

2017-03-30 Thread David Miller

From: Nathan Fontenot 
Date: Wed, 29 Mar 2017 15:14:55 -0400

> The debugfs support in the ibmvnic driver is not, and never has been,
> supported. Just remove it.
> 
> The work done in the debugfs code for the driver was part of the original
> spec for the ibmvnic driver. The corresponding support for this from the
> server side was never supported and has been dropped.
> 
> Signed-off-by: Nathan Fontenot 

Applied to net-next, thanks.

Re: [PATCH net-next] bonding: refine bond_fold_stats() wrap detection

2017-03-30 Thread David Miller

From: Eric Dumazet 
Date: Wed, 29 Mar 2017 10:45:44 -0700

> From: Eric Dumazet 
> 
> Some device drivers reset their stats at down/up events, possibly
> fooling bonding stats, since they operate with relative deltas.
> 
> It is nearly not possible to fix drivers, since some of them compute the
> tx/rx counters based on per rx/tx queue stats, and the queues can be
> reconfigured (ethtool -L) between the down/up sequence.
> 
> Lets avoid accumulating 'negative' values that render bonding stats
> useless.
> 
> It is better to lose small deltas, assuming the bonding stats are
> fetched at a reasonable frequency.
> 
> Fixes: 5f0c5f73e5ef ("bonding: make global bonding stats more reliable")
> Signed-off-by: Eric Dumazet 

Applied, thanks Eric.

Re: [PATCH v3] net/utils: generic inet_pton_with_scope helper

2017-03-30 Thread David Miller

From: Sagi Grimberg 
Date: Wed, 29 Mar 2017 20:48:44 +0300

> Several locations in the stack need to handle ipv4/ipv6
> (with scope) and port strings conversion to sockaddr.
> Add a helper that takes either AF_INET, AF_INET6 or
> AF_UNSPEC (for wildcard) to centralize this handling.
> 
> Suggested-by: Christoph Hellwig 
> Reviewed-by: Christoph Hellwig 
> Signed-off-by: Sagi Grimberg 

This looks fine to me:

Acked-by: David S. Miller

Re: [PATCH 1/2] virtio: allow drivers to validate features

2017-03-30 Thread David Miller

From: "Michael S. Tsirkin" 
Date: Wed, 29 Mar 2017 20:14:44 +0300

> Some drivers can't support all features in all configurations.  At the
> moment we blindly set FEATURES_OK and later FAILED.  Support this better
> by adding a callback drivers can use to do some early checks.
> 
> Signed-off-by: Michael S. Tsirkin 

Michael do you want me to take these virtio networking fixes into my
tree directly or are you going to send me a pull request or something
after it all settles down?

Thanks.

Re: [PATCH net] openvswitch: Fix ovs_flow_key_update()

2017-03-30 Thread Jiri Benc

On Thu, 30 Mar 2017 11:39:35 -0700, Yi-Hung Wei wrote:
> If we invalidate a flow key of a L3 packet, the flow's mac_proto is like this
> (MAC_PROTO_NONE | SW_FLOW_KEY_INVALID), then key_extract() will
> process the link layer of this L3 packet since mac_proto !=MAC_PROTO_NONE?
> 
> In this case, shall we update key_extract() like this
> static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
> 
> /* Link layer. */
> clear_vlan(key);
> -   if (key->mac_proto == MAC_PROTO_NONE) {
> +   if (key->mac_proto & MAC_PROTO_NONE) {

Use ovs_key_mac_proto(key) == MAC_PROTO_NONE.

 Jiri

Re: [PATCH net] openvswitch: Fix ovs_flow_key_update()

2017-03-30 Thread Yi-Hung Wei

On Thu, Mar 30, 2017 at 6:22 AM, Jiri Benc  wrote:
> On Wed, 29 Mar 2017 17:14:10 -0700, Yi-Hung Wei wrote:
>> ovs_flow_key_update() is called when the flow key is invalid, and it is
>> used to update and revalidate the flow key. Commit 329f45bc4f19
>> ("openvswitch: add mac_proto field to the flow key") introduces mac_proto
>> field to flow key and use it to determine whether the flow key is valid.
>> However, the commit does not update the code path in ovs_flow_key_update()
>> to revalidate the flow key which may cause BUG_ON() on execute_recirc().
>> This patch addresses the aforementioned issue.
>
> Good catch.
>
>>  int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
>>  {
>> + int res;
>> +
>> + res = key_extract_mac_proto(skb);
>> + if (res < 0)
>> + return res;
>> + key->mac_proto = res;
>> +
>>   return key_extract(skb, key);
>>  }
>
> But this should just reset the SW_FLOW_KEY_INVALID flag, there's no
> need to recompute mac_proto.
>
> Something like this:
>
>  int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
>  {
> -   return key_extract(skb, key);
> +   int res;
> +
> +   res = key_extract(skb, key);
> +   if (!res)
> +   key->mac_proto &= ~SW_FLOW_KEY_INVALID;
> +   return res;
>  }
Hi Jiri,

One case that I worry is that key_extract() currently relies on mac_proto to
decide whether to process the link layer.  So if we update key->mac_proto
after key_extract(), wouldn't we run into a problem like the following?

If we invalidate a flow key of a L3 packet, the flow's mac_proto is like this
(MAC_PROTO_NONE | SW_FLOW_KEY_INVALID), then key_extract() will
process the link layer of this L3 packet since mac_proto !=MAC_PROTO_NONE?

In this case, shall we update key_extract() like this
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)

/* Link layer. */
clear_vlan(key);
-   if (key->mac_proto == MAC_PROTO_NONE) {
+   if (key->mac_proto & MAC_PROTO_NONE) {
if (unlikely(eth_type_vlan(skb->protocol)))
return -EINVAL;

Thanks,

-Yi-Hung

>
> Thanks,
>
>  Jiri

Re: [PATCH] r8152: The Microsoft Surface docks also use R8152.

2017-03-30 Thread Dan Williams

On Thu, 2017-03-30 at 19:47 +0200, René Rebe wrote:
> Hi,
> 
> On Mar 30, 2017, at 19:06, Dan Williams  wrote:
> 
> > On Tue, 2017-03-28 at 06:42 +0200, Rene Rebe wrote:
> > > Without this the generic cdc_ether grabs the device,
> > > and does not really work.
> > 
> > Does this need a corresponding blacklist in cdc_ether
> > then?  Otherwise
> > you're really depending on driver loading order.
> 
> right, I already test build, and will send after reboot.
> 
> Any other nitpick in the meantime?

Nope, other than it's pretty odd that the device would expose standard
cdc-ether compatible USB descriptors, but apparently not be compatible
with cdc-ether?  Are we sure we don't just need a tweak or something to
cdc-ether?

Dan

> > Dan
> > 
> > > Signed-off-by: René Rebe 
> > > 
> > > diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
> > > index c34df33..07f788c 100644
> > > --- a/drivers/net/usb/r8152.c
> > > +++ b/drivers/net/usb/r8152.c
> > > @@ -517,6 +517,7 @@ enum rtl8152_flags {
> > >  
> > >  /* Define these values to match your device */
> > >  #define VENDOR_ID_REALTEK  0x0bda
> > > +#define VENDOR_ID_MICROSOFT   0x045e
> > >  #define VENDOR_ID_SAMSUNG  0x04e8
> > >  #define VENDOR_ID_LENOVO   0x17ef
> > >  #define VENDOR_ID_NVIDIA   0x0955
> > > @@ -4521,6 +4522,8 @@ static void rtl8152_disconnect(struct
> > > usb_interface *intf)
> > >  static struct usb_device_id rtl8152_table[] = {
> > > {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
> > > {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
> > > +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
> > > +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
> > > {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
> > > {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
> > > {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},
> > > 
> > > 
> > > -- 
> > >   René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789
> > > Berlin
> > >   http://exactcode.com | http://t2-project.org |
> > > http://rene.rebe.de
> 
>

Re: [PATCH net] sctp: alloc stream info when initializing asoc

2017-03-30 Thread David Miller

From: Xin Long 
Date: Thu, 30 Mar 2017 01:00:53 +0800

> When sending a msg without asoc established, sctp will send INIT packet
> first and then enqueue chunks.
> 
> Before receiving INIT_ACK, stream info is not yet alloced. But enqueuing
> chunks needs to access stream info, like out stream state and out stream
> cnt.
> 
> This patch is to fix it by allocing out stream info when initializing an
> asoc, allocing in stream and re-allocing out stream when processing init.
> 
> Signed-off-by: Xin Long 

Applied, thanks.

Re: [PATCH][V2] VSOCK: remove unnecessary ternary operator on return value

2017-03-30 Thread David Miller

From: Colin King 
Date: Wed, 29 Mar 2017 16:33:55 +0100

> From: Colin Ian King 
> 
> Rather than assign the positive errno values to ret and then
> checking if it is positive and flip the sign, just return the
> errno value.
> 
> Detected by CoverityScan, CID#986649 ("Logically Dead Code")
> 
> Signed-off-by: Colin Ian King 

Applied to net-next, thanks.

Re: [PATCH net v2 0/3] net/packet: fix multiple overflow issues in ring buffers

2017-03-30 Thread David Miller

From: Andrey Konovalov 
Date: Wed, 29 Mar 2017 16:11:19 +0200

> This patchset addresses multiple overflows and signedness-related issues
> in packet socket ring buffers.
> 
> Changes in v2:
> - remove cleanup patches, will send in a separate patchset
> - use a > UINT_MAX / b to check for a * b overflow

All applied and queued up for -stable, thanks.

Re: bond procfs hw addr prints

2017-03-30 Thread Jarod Wilson


On 2017-03-13 11:26 PM, Jarod Wilson wrote:

On 2017-03-13 10:06 PM, Jarod Wilson wrote:

On 2017-03-13 8:28 PM, Jay Vosburgh wrote:

Jarod Wilson  wrote:


I've got a bug report for someone using a Intel OPA devices in a
bond, and
it appears these devices have a hardware address length of 20,
opposed to
the typical 6 on ethernet. When they dump /proc/net/bonding/bondX, it
only
prints the first 6 of the address, per %pM and mac_address_string(),
while
sysfs for the interface does print the right thing, since it uses
sysfs_print_mac(), which takes a length argument.


This (20 octet MAC length) is true for any Infiniband device.


So the question is... What's the best route to take here? Expand %pM to
support variable length hardware addresses? Use sysfs_* in procfs?
Reinvent the wheel? Nothing I've tinkered with just yet feels very
clean,
on top of not actually working yet. :)


sysfs_format_mac (not _print_mac) uses "%*phC", len, addr in its
format string.  Perhaps that format would be a better choice than %pM
for this case?


Ah, I'd failed to fully grasp how %phC worked, had actually tried it w/o
the * in there, and only the first char of the addr was printing.
Working on an updated version that uses %*phC properly, which does look
like the way to go here. (Didn't help that I was also looking at an
older codebase that didn't have the sysfs_format_mac de-duplication).
I'll try to have a tested patch in flight tomorrow.


Hm... One problem I'm seeing: perm_hwaddr[ETH_ALEN],
partner_system[ETH_ALEN], mac_addr_value[ETH_ALEN]. Looks like just
about all places where storage for only ETH_ALEN is available needs to
be adjusted to maybe MAX_ADDR_LEN?

So I have something tested that uses %*phC, but only on ethernet
hardware so far, and I foresee bad juju for infiniband, because of that
ETH_ALEN issue...


After some possibly incomplete hacking, I've got an Infiniband bond now 
spewing:


$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)

Bonding Mode: fault-tolerance (active-backup)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100

Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr: 
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01

Slave queue ID: 0

I need to do a lot more testing with this set, but basically, I've 
replaced ETH_ALEN with MAX_ADDR_LEN anywhere I can, and added a function 
that takes addr_len as a param for address copies and put that in place 
of ether_addr_copy(), then used %*phC in bond_procfs.c to get the above.


--
Jarod Wilson
ja...@redhat.com

Re: [PATCH] virtio_net: fix mergeable bufs error handling

2017-03-30 Thread David Miller

From: "Michael S. Tsirkin" 
Date: Wed, 29 Mar 2017 15:37:37 +0300

> @@ -570,7 +570,7 @@ static struct sk_buff *receive_mergeable(struct 
> net_device *dev,
>   u16 num_buf;
>   struct page *page;
>   int offset;
> - struct sk_buff *head_skb, *curr_skb;
> + struct sk_buff *head_skb = NULL, *curr_skb;
>   struct bpf_prog *xdp_prog;
>   unsigned int truesize;
>  

Like Jason, I see the very next line is:

head_skb = NULL;

so I don't think this is necessary.

Re: [PATCH v2] net: stmmac: dwmac-rk: Add handling for RGMII_ID/RXID/TXID

2017-03-30 Thread David Miller

From: Wadim Egorov 
Date: Wed, 29 Mar 2017 14:12:19 +0200

> ATM dwmac-rk will always set and enable its internal delay lines.
> Using PHY internal delays in combination with the phy-mode
> rgmii-id/rxid/txid was not possible. Only rgmii was supported.
> 
> Now we can disable rockchip's gmac delay lines and also use
> rgmii-id/rxid/txid.
> 
> Tested only with a RK3288 based board.
> 
> Signed-off-by: Wadim Egorov 
> ---
> Changes in v2: Added parenthesis around both expressions in DELAY_ENABLE

Applied to net-next, thanks.

Re: [PATCH] [net-next] stmmac: use netif_set_real_num_{rx,tx}_queues

2017-03-30 Thread David Miller

From: Thierry Reding 
Date: Thu, 30 Mar 2017 16:34:36 +0200

> On Thu, Mar 30, 2017 at 09:45:36AM +0200, Corentin Labbe wrote:
>> On Tue, Mar 28, 2017 at 06:01:05PM -0700, David Miller wrote:
>> > From: Arnd Bergmann 
>> > Date: Tue, 28 Mar 2017 11:48:21 +0200
>> > 
>> > > A driver must not access the two fields directly but should instead use
>> > > the helper functions to set the values and keep a consistent internal
>> > > state:
>> > > 
>> > > ethernet/stmicro/stmmac/stmmac_main.c: In function 'stmmac_dvr_probe':
>> > > ethernet/stmicro/stmmac/stmmac_main.c:4083:8: error: 'struct net_device' 
>> > > has no member named 'real_num_rx_queues'; did you mean 
>> > > 'real_num_tx_queues'?
>> > > 
>> > > Fixes: a8f5102af2a7 ("net: stmmac: TX and RX queue priority 
>> > > configuration")
>> > > Signed-off-by: Arnd Bergmann 
>> > 
>> > Applied.
>> 
>> This break my revert patch. (since it patch ("net: stmmac: enable multiple 
>> buffers").
>> Since dwmac-sunxi is still broken, what can I do ? send two revert patch ? 
>> or adapt the reverting patch.
> 
> Have you tried if the kcalloc() patch I sent on Tuesday fixes things the
> issues introduced by the multiple buffers patch? Niklas reported that it
> restores functionality on his setup.

I think he said yesterday that he did indeed test all of your patches and it
did not fix things for him.

http://marc.info/?l=linux-kernel&m=149076922813085&w=2

I am going to revert the enable multiple buffers commit, and I would ask that
all involved parties work together in the background to resolve all of this.

Thank you.

Re: [PATCH] r8152: The Microsoft Surface docks also use R8152.

2017-03-30 Thread René Rebe

Hi,

On Mar 30, 2017, at 19:06, Dan Williams  wrote:

> On Tue, 2017-03-28 at 06:42 +0200, Rene Rebe wrote:
>> Without this the generic cdc_ether grabs the device,
>> and does not really work.
> 
> Does this need a corresponding blacklist in cdc_ether then?  Otherwise
> you're really depending on driver loading order.

right, I already test build, and will send after reboot.

Any other nitpick in the meantime?

> Dan
> 
>> Signed-off-by: René Rebe 
>> 
>> diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
>> index c34df33..07f788c 100644
>> --- a/drivers/net/usb/r8152.c
>> +++ b/drivers/net/usb/r8152.c
>> @@ -517,6 +517,7 @@ enum rtl8152_flags {
>>  
>>  /* Define these values to match your device */
>>  #define VENDOR_ID_REALTEK  0x0bda
>> +#define VENDOR_ID_MICROSOFT   0x045e
>>  #define VENDOR_ID_SAMSUNG  0x04e8
>>  #define VENDOR_ID_LENOVO   0x17ef
>>  #define VENDOR_ID_NVIDIA   0x0955
>> @@ -4521,6 +4522,8 @@ static void rtl8152_disconnect(struct
>> usb_interface *intf)
>>  static struct usb_device_id rtl8152_table[] = {
>> {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
>> {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
>> +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
>> +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
>> {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
>> {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
>> {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},
>> 
>> 
>> -- 
>>   René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
>>   http://exactcode.com | http://t2-project.org | http://rene.rebe.de

-- 
 ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
 http://exactcode.com | http://exactscan.com | http://ocrkit.com | 
http://t2-project.org | http://rene.rebe.de

RE: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements

2017-03-30 Thread Salil Mehta



> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Thursday, March 30, 2017 6:22 PM
> To: Salil Mehta
> Cc: Zhuangyuzeng (Yisen); mehta.salil@gmail.com;
> netdev@vger.kernel.org; linux-ker...@vger.kernel.org; Linuxarm
> Subject: Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code
> Improvements
> 
> From: Salil Mehta 
> Date: Thu, 30 Mar 2017 17:19:44 +0000
> 
> >
> >> -Original Message-
> >> From: David Miller [mailto:da...@davemloft.net]
> >> Sent: Thursday, March 30, 2017 6:09 PM
> >> To: Salil Mehta
> >> Cc: Zhuangyuzeng (Yisen); mehta.salil@gmail.com;
> >> netdev@vger.kernel.org; linux-ker...@vger.kernel.org; Linuxarm
> >> Subject: Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code
> >> Improvements
> >>
> >> From: Salil Mehta 
> >> Date: Thu, 30 Mar 2017 16:30:47 +0100
> >>
> >> > This patch set introduces various HNS bug fixes, optimizations and
> code
> >> > improvements.
> >>
> >> What tree are you targeting?
> >>
> >> You say "net" in your Subject lines, but this series has cleanups
> and all
> >> sorts
> >> of other things which are absolutely not appropriate for 'net' and
> are
> >> 'net-next'
> >> material.
> > Hi David,
> > Sorry David, These bug fixes are for the next merge window and for
> net-next.
> > Should I resend the patches with below change?
> 
> It is not necessary to resend, thanks for clarifying.
Sure thanks.

Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements

2017-03-30 Thread David Miller

From: Salil Mehta 
Date: Thu, 30 Mar 2017 17:19:44 +0000

> 
>> -Original Message-
>> From: David Miller [mailto:da...@davemloft.net]
>> Sent: Thursday, March 30, 2017 6:09 PM
>> To: Salil Mehta
>> Cc: Zhuangyuzeng (Yisen); mehta.salil@gmail.com;
>> netdev@vger.kernel.org; linux-ker...@vger.kernel.org; Linuxarm
>> Subject: Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code
>> Improvements
>> 
>> From: Salil Mehta 
>> Date: Thu, 30 Mar 2017 16:30:47 +0100
>> 
>> > This patch set introduces various HNS bug fixes, optimizations and code
>> > improvements.
>> 
>> What tree are you targeting?
>> 
>> You say "net" in your Subject lines, but this series has cleanups and all
>> sorts
>> of other things which are absolutely not appropriate for 'net' and are
>> 'net-next'
>> material.
> Hi David,
> Sorry David, These bug fixes are for the next merge window and for net-next.
> Should I resend the patches with below change?

It is not necessary to resend, thanks for clarifying.

RE: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements

2017-03-30 Thread Salil Mehta


> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Thursday, March 30, 2017 6:09 PM
> To: Salil Mehta
> Cc: Zhuangyuzeng (Yisen); mehta.salil@gmail.com;
> netdev@vger.kernel.org; linux-ker...@vger.kernel.org; Linuxarm
> Subject: Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code
> Improvements
> 
> From: Salil Mehta 
> Date: Thu, 30 Mar 2017 16:30:47 +0100
> 
> > This patch set introduces various HNS bug fixes, optimizations and code
> > improvements.
> 
> What tree are you targeting?
> 
> You say "net" in your Subject lines, but this series has cleanups and all
> sorts
> of other things which are absolutely not appropriate for 'net' and are
> 'net-next'
> material.
Hi David,
Sorry David, These bug fixes are for the next merge window and for net-next.
Should I resend the patches with below change?

[PATCH net-next 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements  

Best regards
Salil

Re: [PATCH net 00/19] net: hns: Misc. HNS Bug Fixes & Code Improvements

2017-03-30 Thread David Miller

From: Salil Mehta 
Date: Thu, 30 Mar 2017 16:30:47 +0100

> This patch set introduces various HNS bug fixes, optimizations and code
> improvements.

What tree are you targeting?

You say "net" in your Subject lines, but this series has cleanups and all sorts
of other things which are absolutely not appropriate for 'net' and are 
'net-next'
material.

Re: [PATCH] r8152: The Microsoft Surface docks also use R8152.

2017-03-30 Thread Dan Williams

On Tue, 2017-03-28 at 06:42 +0200, Rene Rebe wrote:
> Without this the generic cdc_ether grabs the device,
> and does not really work.

Does this need a corresponding blacklist in cdc_ether then?  Otherwise
you're really depending on driver loading order.

Dan

> Signed-off-by: René Rebe 
> 
> diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
> index c34df33..07f788c 100644
> --- a/drivers/net/usb/r8152.c
> +++ b/drivers/net/usb/r8152.c
> @@ -517,6 +517,7 @@ enum rtl8152_flags {
>  
>  /* Define these values to match your device */
>  #define VENDOR_ID_REALTEK  0x0bda
> +#define VENDOR_ID_MICROSOFT   0x045e
>  #define VENDOR_ID_SAMSUNG  0x04e8
>  #define VENDOR_ID_LENOVO   0x17ef
>  #define VENDOR_ID_NVIDIA   0x0955
> @@ -4521,6 +4522,8 @@ static void rtl8152_disconnect(struct
> usb_interface *intf)
>  static struct usb_device_id rtl8152_table[] = {
> {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
> {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
> +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
> +   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
> {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
> {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
> {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},
> 
> 
> -- 
>   René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
>   http://exactcode.com | http://t2-project.org | http://rene.rebe.de

[PATCH] r8152: The Microsoft Surface docks also use R8152.

2017-03-30 Thread Rene Rebe

Without this the generic cdc_ether grabs the device,
and does not really work.

Signed-off-by: René Rebe 

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index c34df33..07f788c 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -517,6 +517,7 @@ enum rtl8152_flags {
 
 /* Define these values to match your device */
 #define VENDOR_ID_REALTEK  0x0bda
+#define VENDOR_ID_MICROSOFT   0x045e
 #define VENDOR_ID_SAMSUNG  0x04e8
 #define VENDOR_ID_LENOVO   0x17ef
 #define VENDOR_ID_NVIDIA   0x0955
@@ -4521,6 +4522,8 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 static struct usb_device_id rtl8152_table[] = {
{REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
{REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
+   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
+   {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
{REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},


-- 
  René Rebe, ExactCODE GmbH, Lietzenburger Str. 42, DE-10789 Berlin
  http://exactcode.com | http://t2-project.org | http://rene.rebe.de

Re: [PATCH] [net-next] stmmac: use netif_set_real_num_{rx,tx}_queues

2017-03-30 Thread Joao Pinto

Às 5:35 PM de 3/30/2017, Niklas Cassel escreveu:
> On 03/30/2017 04:34 PM, Thierry Reding wrote:
>> On Thu, Mar 30, 2017 at 09:45:36AM +0200, Corentin Labbe wrote:
>>> On Tue, Mar 28, 2017 at 06:01:05PM -0700, David Miller wrote:
>>>> From: Arnd Bergmann 
>>>> Date: Tue, 28 Mar 2017 11:48:21 +0200
>>>>
>>>>> A driver must not access the two fields directly but should instead use
>>>>> the helper functions to set the values and keep a consistent internal
>>>>> state:
>>>>>
>>>>> ethernet/stmicro/stmmac/stmmac_main.c: In function 'stmmac_dvr_probe':
>>>>> ethernet/stmicro/stmmac/stmmac_main.c:4083:8: error: 'struct net_device' 
>>>>> has no member named 'real_num_rx_queues'; did you mean 
>>>>> 'real_num_tx_queues'?
>>>>>
>>>>> Fixes: a8f5102af2a7 ("net: stmmac: TX and RX queue priority 
>>>>> configuration")
>>>>> Signed-off-by: Arnd Bergmann 
>>>>
>>>> Applied.
>>>
>>> This break my revert patch. (since it patch ("net: stmmac: enable multiple 
>>> buffers").
>>> Since dwmac-sunxi is still broken, what can I do ? send two revert patch ? 
>>> or adapt the reverting patch.
>>
>> Have you tried if the kcalloc() patch I sent on Tuesday fixes things the
>> issues introduced by the multiple buffers patch? Niklas reported that it
>> restores functionality on his setup.
>>
>> If it makes things work for you as well, we could maybe avoid the revert
>> altogether.
> 
> Thierry, I know that you are using DWMAC CORE 4.XX
> How many RX queues and how many TX queues have you got?
> 
> I'm also using DWMAC CORE 4.XX
> We have 2 TX queues and 1 RX queue.
> 
> I think that Corentin is using DWMAC CORE 3.XX
> 
> I know that Joao is using an IP Prototyping Kit that uses
> DWMAC CORE 4.XX (connected via PCIe).
> It would be nice if Joao could get an IP Prototyping Kit
> based on DWMAC CORE 3.XX.
> 
> Doesn't Synopsys have an IP Prototyping Kit based on
> DWMAC CORE 3.XX laying around somewhere? :)
> 

I requested a prototyping platform with MAC 100 or a MAC 1000 in order to make
more tests, but I don't have an ETA for it yet.

The implication of the multiple buffers patch in 3.xx is some flow change in the
configuration of dma op mode or similar. I would recommend Corentin to dump the
dma & mac registers in the end of the _open function in order to see if the DMA
is really being well configured and is really started.

Thanks.

Joao

Re: Re: [PATCH] [net-next] stmmac: use netif_set_real_num_{rx,tx}_queues

2017-03-30 Thread Niklas Cassel

On 03/30/2017 04:34 PM, Thierry Reding wrote:
> On Thu, Mar 30, 2017 at 09:45:36AM +0200, Corentin Labbe wrote:
>> On Tue, Mar 28, 2017 at 06:01:05PM -0700, David Miller wrote:
>>> From: Arnd Bergmann 
>>> Date: Tue, 28 Mar 2017 11:48:21 +0200
>>>
>>>> A driver must not access the two fields directly but should instead use
>>>> the helper functions to set the values and keep a consistent internal
>>>> state:
>>>>
>>>> ethernet/stmicro/stmmac/stmmac_main.c: In function 'stmmac_dvr_probe':
>>>> ethernet/stmicro/stmmac/stmmac_main.c:4083:8: error: 'struct net_device' 
>>>> has no member named 'real_num_rx_queues'; did you mean 
>>>> 'real_num_tx_queues'?
>>>>
>>>> Fixes: a8f5102af2a7 ("net: stmmac: TX and RX queue priority configuration")
>>>> Signed-off-by: Arnd Bergmann 
>>>
>>> Applied.
>>
>> This break my revert patch. (since it patch ("net: stmmac: enable multiple 
>> buffers").
>> Since dwmac-sunxi is still broken, what can I do ? send two revert patch ? 
>> or adapt the reverting patch.
> 
> Have you tried if the kcalloc() patch I sent on Tuesday fixes things the
> issues introduced by the multiple buffers patch? Niklas reported that it
> restores functionality on his setup.
> 
> If it makes things work for you as well, we could maybe avoid the revert
> altogether.

Thierry, I know that you are using DWMAC CORE 4.XX
How many RX queues and how many TX queues have you got?

I'm also using DWMAC CORE 4.XX
We have 2 TX queues and 1 RX queue.

I think that Corentin is using DWMAC CORE 3.XX

I know that Joao is using an IP Prototyping Kit that uses
DWMAC CORE 4.XX (connected via PCIe).
It would be nice if Joao could get an IP Prototyping Kit
based on DWMAC CORE 3.XX.

Doesn't Synopsys have an IP Prototyping Kit based on
DWMAC CORE 3.XX laying around somewhere? :)

[PATCH net-next v2 2/6] net: mpls: Convert number of nexthops to u8

2017-03-30 Thread David Ahern

Number of nexthops and number of alive nexthops are tracked using an
unsigned int. A route should never have more than 255 nexthops so
convert both to u8. Update all references and intermediate variables
to consistently use u8 as well.

Shrinks the size of mpls_route from 32 bytes to 24 bytes with a 2-byte
hole before the nexthops.

Signed-off-by: David Ahern 
---
v2
- label u16 hole in mpls_route as rt_reserved1

 net/mpls/af_mpls.c  | 28 +---
 net/mpls/internal.h |  5 +++--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 6bdd2f95b576..665dec84f001 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -197,10 +197,10 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, 
struct sk_buff *skb)
 static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 struct sk_buff *skb)
 {
-   unsigned int alive;
u32 hash = 0;
int nh_index = 0;
int n = 0;
+   u8 alive;
 
/* No need to look further into packet if there's only
 * one path
@@ -466,7 +466,7 @@ struct mpls_route_config {
int rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
+static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen)
 {
u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN);
struct mpls_route *rt;
@@ -744,11 +744,11 @@ static int mpls_nh_build(struct net *net, struct 
mpls_route *rt,
return err;
 }
 
-static int mpls_count_nexthops(struct rtnexthop *rtnh, int len,
-  u8 cfg_via_alen, u8 *max_via_alen)
+static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len,
+ u8 cfg_via_alen, u8 *max_via_alen)
 {
-   int nhs = 0;
int remaining = len;
+   u8 nhs = 0;
 
if (!rtnh) {
*max_via_alen = cfg_via_alen;
@@ -773,7 +773,13 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int 
len,
  via_alen);
}
 
+   /* number of nexthops is tracked by a u8.
+* Check for overflow.
+*/
+   if (nhs == 255)
+   return 0;
nhs++;
+
rtnh = rtnh_next(rtnh, );
}
 
@@ -787,8 +793,8 @@ static int mpls_nh_build_multi(struct mpls_route_config 
*cfg,
struct rtnexthop *rtnh = cfg->rc_mp;
struct nlattr *nla_via, *nla_newdst;
int remaining = cfg->rc_mp_len;
-   int nhs = 0;
int err = 0;
+   u8 nhs = 0;
 
change_nexthops(rt) {
int attrlen;
@@ -842,7 +848,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
int err = -EINVAL;
u8 max_via_alen;
unsigned index;
-   int nhs;
+   u8 nhs;
 
index = cfg->rc_label;
 
@@ -1310,7 +1316,7 @@ static void mpls_ifdown(struct net_device *dev, int event)
 {
struct mpls_route __rcu **platform_label;
struct net *net = dev_net(dev);
-   unsigned int alive, deleted;
+   u8 alive, deleted;
unsigned index;
 
platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -1362,7 +1368,7 @@ static void mpls_ifup(struct net_device *dev, unsigned 
int flags)
struct mpls_route __rcu **platform_label;
struct net *net = dev_net(dev);
unsigned index;
-   int alive;
+   u8 alive;
 
platform_label = rtnl_dereference(net->mpls.platform_label);
for (index = 0; index < net->mpls.platform_labels; index++) {
@@ -1786,8 +1792,8 @@ static int mpls_dump_route(struct sk_buff *skb, u32 
portid, u32 seq, int event,
} else {
struct rtnexthop *rtnh;
struct nlattr *mp;
-   int dead = 0;
-   int linkdown = 0;
+   u8 linkdown = 0;
+   u8 dead = 0;
 
mp = nla_nest_start(skb, RTA_MULTIPATH);
if (!mp)
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 91419fe63464..2ac97433c3b7 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -127,12 +127,13 @@ struct mpls_route { /* next hop label forwarding entry */
u8  rt_payload_type;
u8  rt_max_alen;
u8  rt_ttl_propagate;
-   unsigned intrt_nhn;
+   u8  rt_nhn;
 
/* rt_nhn_alive is accessed under RCU in the packet path; it
 * is modified handling netdev events with rtnl lock held
 */
-   unsigned intrt_nhn_alive;
+   u8  rt_nhn_alive;
+   u16 rt_reserved1;
struct mpls_nh  rt_nh[0];
 };
 
-- 
2.1.4

[PATCH net-next v2 4/6] net: mpls: Limit memory allocation for mpls_route

2017-03-30 Thread David Ahern

Limit memory allocation size for mpls_route to 4096 bytes.

Signed-off-by: David Ahern 
---
v2
- new patch in v2 of set

 net/mpls/af_mpls.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 1863b94133e4..f84c52b6eafc 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -26,6 +26,9 @@
 
 #define MAX_NEW_LABELS 2
 
+/* max memory we will use for mpls_route */
+#define MAX_MPLS_ROUTE_MEM 4096
+
 /* Maximum number of labels to look ahead at when selecting a path of
  * a multipath route
  */
@@ -477,14 +480,20 @@ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 
max_alen, u8 max_labels)
 {
u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen);
struct mpls_route *rt;
+   size_t size;
 
-   rt = kzalloc(sizeof(*rt) + num_nh * nh_size, GFP_KERNEL);
-   if (rt) {
-   rt->rt_nhn = num_nh;
-   rt->rt_nhn_alive = num_nh;
-   rt->rt_nh_size = nh_size;
-   rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels);
-   }
+   size = sizeof(*rt) + num_nh * nh_size;
+   if (size > MAX_MPLS_ROUTE_MEM)
+   return ERR_PTR(-EINVAL);
+
+   rt = kzalloc(size, GFP_KERNEL);
+   if (!rt)
+   return ERR_PTR(-ENOMEM);
+
+   rt->rt_nhn = num_nh;
+   rt->rt_nhn_alive = num_nh;
+   rt->rt_nh_size = nh_size;
+   rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels);
 
return rt;
 }
@@ -898,8 +907,10 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 
err = -ENOMEM;
rt = mpls_rt_alloc(nhs, max_via_alen, MAX_NEW_LABELS);
-   if (!rt)
+   if (IS_ERR(rt)) {
+   err = PTR_ERR(rt);
goto errout;
+   }
 
rt->rt_protocol = cfg->rc_protocol;
rt->rt_payload_type = cfg->rc_payload_type;
@@ -1970,7 +1981,7 @@ static int resize_platform_label_table(struct net *net, 
size_t limit)
if (limit > MPLS_LABEL_IPV4NULL) {
struct net_device *lo = net->loopback_dev;
rt0 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
-   if (!rt0)
+   if (IS_ERR(rt0))
goto nort0;
RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo);
rt0->rt_protocol = RTPROT_KERNEL;
@@ -1984,7 +1995,7 @@ static int resize_platform_label_table(struct net *net, 
size_t limit)
if (limit > MPLS_LABEL_IPV6NULL) {
struct net_device *lo = net->loopback_dev;
rt2 = mpls_rt_alloc(1, lo->addr_len, MAX_NEW_LABELS);
-   if (!rt2)
+   if (IS_ERR(rt2))
goto nort2;
RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo);
rt2->rt_protocol = RTPROT_KERNEL;
-- 
2.1.4

[PATCH net-next v2 0/6] net: mpls: Allow users to configure more labels per route

2017-03-30 Thread David Ahern

Increase the maximum number of new labels for MPLS routes from 2 to 30.

To keep memory consumption in check, the labels array is moved to the end
of mpls_nh and mpls_iptunnel_encap structs as a 0-sized array. Allocations
use the maximum number of labels across all nexthops in a route for LSR
and the number of labels configured for LWT.

The mpls_route layout is changed to:

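   +----------------------+
   | mpls_route           |
   +----------------------+
   | mpls_nh 0            |
   +----------------------+
   | alignment padding    |   4 bytes for odd number of labels; 0 for even
   +----------------------+
   | via[rt_max_alen] 0   |
   +----------------------+
   | alignment padding    |   via's aligned on sizeof(unsigned long)
   +----------------------+
   | ...                  |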

Meaning the via follows its mpls_nh providing better locality as the
number of labels increases. UDP_RR tests with namespaces show no impact
to a modest performance increase with this layout for 1 or 2 labels and
1 or 2 nexthops.

mpls_route allocation size is limited to 4096 bytes allowing on the
order of 30 nexthops with 30 labels (or more nexthops with fewer
labels). LWT encap shares the same maximum number of labels as mpls routing.

v2
- updates per Eric's comments
  + added patch to ensure all reads of rt_nhn_alive and nh_flags in
the packet path use READ_ONCE and all writes via event handlers
use WRITE_ONCE

  + limit mpls_route size to 4096 (PAGE_SIZE for most arch)

  + mostly killed use of MAX_NEW_LABELS; it exists only for common
limit between lwt and routing paths

David Ahern (6):
  net: mpls: rt_nhn_alive and nh_flags should be accessed using
READ_ONCE
  net: mpls: Convert number of nexthops to u8
  net: mpls: change mpls_route layout
  net: mpls: Limit memory allocation for mpls_route
  net: mpls: bump maximum number of labels
  net: mpls: Increase max number of labels for lwt encap

 include/net/mpls_iptunnel.h |   5 +-
 net/mpls/af_mpls.c  | 210 +---
 net/mpls/internal.h |  61 +
 net/mpls/mpls_iptunnel.c|  13 ++-
 4 files changed, 196 insertions(+), 93 deletions(-)

-- 
2.1.4

[PATCH net-next v2 6/6] net: mpls: Increase max number of labels for lwt encap

2017-03-30 Thread David Ahern

Allow users to push down more labels per MPLS encap. Similar to LSR case,
move label array to the end of mpls_iptunnel_encap and allocate based on
the number of labels for the route.

For consistency with the LSR case, re-use the same maximum number of
labels.

Signed-off-by: David Ahern 
---
v2
- marked hole in mpls_iptunnel_encap as reserved1

 include/net/mpls_iptunnel.h |  5 ++---
 net/mpls/af_mpls.c  |  5 -
 net/mpls/internal.h |  5 +
 net/mpls/mpls_iptunnel.c| 13 ++---
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index a18af6a16eb5..9d22bf67ac86 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -14,13 +14,12 @@
 #ifndef _NET_MPLS_IPTUNNEL_H
 #define _NET_MPLS_IPTUNNEL_H 1
 
-#define MAX_NEW_LABELS 2
-
 struct mpls_iptunnel_encap {
-   u32 label[MAX_NEW_LABELS];
u8  labels;
u8  ttl_propagate;
u8  default_ttl;
+   u8  reserved1;
+   u32 label[0];
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct 
lwtunnel_state *lwtstate)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 2458d7ed2ab5..2da15dcb2675 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -24,11 +24,6 @@
 #include 
 #include "internal.h"
 
-/* put a reasonable limit on the number of labels
- * we will accept from userspace
- */
-#define MAX_NEW_LABELS 30
-
 /* max memory we will use for mpls_route */
 #define MAX_MPLS_ROUTE_MEM 4096
 
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index c5d2f5bc37ec..4db6a5971322 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -2,6 +2,11 @@
 #define MPLS_INTERNAL_H
 #include 
 
+/* put a reasonable limit on the number of labels
+ * we will accept from userspace
+ */
+#define MAX_NEW_LABELS 30
+
 struct mpls_entry_decoded {
u32 label;
u8 ttl;
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 22f71fce0bfb..fe00e98667cf 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -164,6 +164,7 @@ static int mpls_build_state(struct nlattr *nla,
struct mpls_iptunnel_encap *tun_encap_info;
struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
struct lwtunnel_state *newts;
+   u8 n_labels;
int ret;
 
ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
@@ -175,12 +176,18 @@ static int mpls_build_state(struct nlattr *nla,
return -EINVAL;
 
 
-   newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
+   /* determine number of labels */
+   if (nla_get_labels(tb[MPLS_IPTUNNEL_DST],
+  MAX_NEW_LABELS, _labels, NULL))
+   return -EINVAL;
+
+   newts = lwtunnel_state_alloc(sizeof(*tun_encap_info) +
+n_labels * sizeof(u32));
if (!newts)
return -ENOMEM;
 
tun_encap_info = mpls_lwtunnel_encap(newts);
-   ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+   ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels,
 _encap_info->labels, tun_encap_info->label);
if (ret)
goto errout;
@@ -257,7 +264,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct 
lwtunnel_state *b)
a_hdr->default_ttl != b_hdr->default_ttl)
return 1;
 
-   for (l = 0; l < MAX_NEW_LABELS; l++)
+   for (l = 0; l < a_hdr->labels; l++)
if (a_hdr->label[l] != b_hdr->label[l])
return 1;
return 0;
-- 
2.1.4

1 2 3 >

1 - 100 of 203 matches

Mail list logo