[PATCH bpf-next 2/4] bpf: Sync bpf.h to tools/
Sync skb_ancestor_cgroup_id() related bpf UAPI changes to tools/. Signed-off-by: Andrey Ignatov --- tools/include/uapi/linux/bpf.h | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3102a2a23c31..66917a4eba27 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2093,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2207,7 +2225,8 @@ union bpf_attr { FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ FN(get_local_storage), \ - FN(sk_select_reuseport), + FN(sk_select_reuseport),\ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- 2.17.1
[PATCH bpf-next 3/4] selftests/bpf: Add cgroup id helpers to bpf_helpers.h
Add bpf_skb_cgroup_id and bpf_skb_ancestor_cgroup_id helpers to bpf_helpers.h to use them in tests and samples. Signed-off-by: Andrey Ignatov --- tools/testing/selftests/bpf/bpf_helpers.h | 4 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 5c32266c2c38..e4be7730222d 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -139,6 +139,10 @@ static unsigned long long (*bpf_get_current_cgroup_id)(void) = (void *) BPF_FUNC_get_current_cgroup_id; static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = (void *) BPF_FUNC_get_local_storage; +static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = + (void *) BPF_FUNC_skb_cgroup_id; +static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = + (void *) BPF_FUNC_skb_ancestor_cgroup_id; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- 2.17.1
[PATCH bpf-next 4/4] selftests/bpf: Selftest for bpf_skb_ancestor_cgroup_id
Add selftests for bpf_skb_ancestor_cgroup_id helper. test_skb_cgroup_id.sh prepares testing interface and adds tc qdisc and filter for it using BPF object compiled from test_skb_cgroup_id_kern.c program. BPF program in test_skb_cgroup_id_kern.c gets ancestor cgroup id using the new helper at different levels of cgroup hierarchy that skb belongs to, including root level and non-existing level, and saves it to the map where the key is the level of corresponding cgroup and the value is its id. To trigger BPF program, user space program test_skb_cgroup_id_user is run. It adds itself into testing cgroup and sends UDP datagram to link-local multicast address of testing interface. Then it reads cgroup ids saved in kernel for different levels from the BPF map and compares them with those in user space. They must be equal for every level of ancestry. Example of run: # ./test_skb_cgroup_id.sh Wait for testing link-local IP to become available ... OK Note: 8 bytes struct bpf_elf_map fixup performed due to size mismatch! 
[PASS] Signed-off-by: Andrey Ignatov --- tools/testing/selftests/bpf/Makefile | 9 +- .../selftests/bpf/test_skb_cgroup_id.sh | 61 ++ .../selftests/bpf/test_skb_cgroup_id_kern.c | 47 + .../selftests/bpf/test_skb_cgroup_id_user.c | 187 ++ 4 files changed, 301 insertions(+), 3 deletions(-) create mode 100755 tools/testing/selftests/bpf/test_skb_cgroup_id.sh create mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c create mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_user.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index daed162043c2..fff7fb1285fc 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -34,7 +34,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ - get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o + get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ + test_skb_cgroup_id_kern.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -45,10 +46,11 @@ TEST_PROGS := test_kmod.sh \ test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ - test_lirc_mode2.sh + test_lirc_mode2.sh \ + test_skb_cgroup_id.sh # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr +TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user include ../lib.mk @@ -59,6 +61,7 @@ $(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c +$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_socket_cookie: cgroup_helpers.c diff --git 
a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh new file mode 100755 index ..b75e9b52f06f --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Facebook + +set -eu + +wait_for_ip() +{ + local _i + echo -n "Wait for testing link-local IP to become available " + for _i in $(seq ${MAX_PING_TRIES}); do + echo -n "." + if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then + echo " OK" + return + fi + done + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." + exit 1 +} + +setup() +{ + # Create testing interfaces not to interfere with current environment. + ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} + ip link set ${TEST_IF} up + ip link set ${TEST_IF_PEER} up + + wait_for_ip + + tc qdisc add dev ${TEST_IF} clsact + tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ + sec ${BPF_PROG_SECTION} da + + BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ + awk '/ id / {sub(/.* id /, "", $0); print($1)}') +} + +cleanup() +{ + ip link del ${TEST_IF} 2>/dev/null || : + ip link del ${TEST_IF_PEER} 2>/dev/null || : +} + +main() +{ + trap cleanup EXIT 2 3 6 15 + setup + ${PROG} ${TEST_IF} ${BPF_PROG_ID} +} + +DIR=$(dirname $0) +TEST_IF="test_cgid_1" +TEST_IF_PEER="test_cgid_2" +MAX_PING_TRIES=5 +BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" +BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_ID=0 +PROG="${DIR}/test_skb_cgroup_id_u
[PATCH bpf-next 0/4] bpf_skb_ancestor_cgroup_id helper
This patch set adds new BPF helper bpf_skb_ancestor_cgroup_id that returns id of cgroup v2 that is ancestor of cgroup associated with the skb at the ancestor_level. The helper is useful to implement policies in TC based on cgroups that are upper in hierarchy than immediate cgroup associated with skb. Patch 0001 provides more details and describes use-cases. Patch 0002 syncs UAPI changes to tools/. Patch 0003 adds skb*cgroup_id helpers to bpf_helpers.h header. Patch 0004 adds selftest for the new helper and is an example of usage. Andrey Ignatov (4): bpf: Introduce bpf_skb_ancestor_cgroup_id helper bpf: Sync bpf.h to tools/ selftests/bpf: Add cgroup id helpers to bpf_helpers.h selftests/bpf: Selftest for bpf_skb_ancestor_cgroup_id include/linux/cgroup.h| 30 +++ include/uapi/linux/bpf.h | 21 +- net/core/filter.c | 28 +++ tools/include/uapi/linux/bpf.h| 21 +- tools/testing/selftests/bpf/Makefile | 9 +- tools/testing/selftests/bpf/bpf_helpers.h | 4 + .../selftests/bpf/test_skb_cgroup_id.sh | 61 ++ .../selftests/bpf/test_skb_cgroup_id_kern.c | 47 + .../selftests/bpf/test_skb_cgroup_id_user.c | 187 ++ 9 files changed, 403 insertions(+), 5 deletions(-) create mode 100755 tools/testing/selftests/bpf/test_skb_cgroup_id.sh create mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c create mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_user.c -- 2.17.1
[PATCH bpf-next 1/4] bpf: Introduce bpf_skb_ancestor_cgroup_id helper
== Problem description == It's useful to be able to identify cgroup associated with skb in TC so that a policy can be applied to this skb, and existing bpf_skb_cgroup_id helper can help with this. Though in real life cgroup hierarchy and hierarchy to apply a policy to don't map 1:1. It's often the case that there is a container and corresponding cgroup, but there are many more sub-cgroups inside container, e.g. because it's delegated to containerized application to control resources for its subsystems, or to separate application inside container from infra that belongs to containerization system (e.g. sshd). At the same time it may be useful to apply a policy to container as a whole. If multiple containers like this are run on a host (what is often the case) and many of them have sub-cgroups, it may not be possible to apply per-container policy in TC with existing helpers such as bpf_skb_under_cgroup or bpf_skb_cgroup_id: * bpf_skb_cgroup_id will return id of immediate cgroup associated with skb, i.e. if it's a sub-cgroup inside container, it can't be used to identify container's cgroup; * bpf_skb_under_cgroup can work only with one cgroup and doesn't scale, i.e. if there are N containers on a host and a policy has to be applied to M of them (0 <= M <= N), it'd require M calls to bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild & load new BPF program. == Solution == The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be used to get id of cgroup v2 that is an ancestor of cgroup associated with skb at specified level of cgroup hierarchy. That way admin can place all containers on one level of cgroup hierarchy (what is a good practice in general and already used in many configurations) and identify specific cgroup on this level no matter what sub-cgroup skb is associated with. E.g. 
if there is a cgroup hierarchy: root/ root/container1/ root/container1/app11/ root/container1/app11/sub-app-a/ root/container1/app12/ root/container2/ root/container2/app21/ root/container2/app22/ root/container2/app22/sub-app-b/ , then having skb associated with root/container1/app11/sub-app-a/ it's possible to get ancestor at level 1, what is container1 and apply policy for this container, or apply another policy if it's container2. Policies can be kept e.g. in a hash map where key is a container cgroup id and value is an action. Levels where container cgroups are created are usually known in advance, whereas cgroup hierarchy inside container may be hard to predict especially in case when its creation is delegated to containerized application. == Implementation details == The helper gets ancestor by walking parents up to specified level. Another option would be to get different kind of "id" from cgroup->ancestor_ids[level] and use it with idr_find() to get struct cgroup for ancestor. But that would require radix lookup what doesn't seem to be better (at least it's not obviously better). Format of return value of the new helper is same as that of bpf_skb_cgroup_id. Signed-off-by: Andrey Ignatov --- include/linux/cgroup.h | 30 ++ include/uapi/linux/bpf.h | 21 - net/core/filter.c| 28 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level.
+ * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, +int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; +ptr && ptr->level > ancestor_level; +ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3102a2a23c31..66917a4eba27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2093,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct s
Re: [net-next 03/12] net/mlx5e: Ethtool steering, ip6 support
From: Saeed Mahameed Date: Fri, 10 Aug 2018 15:26:21 -0700 > +static bool is_zero_ip6(__be32 ip6[4]) > +{ > + int i; > + > + for (i = 0; i < 4; i++) > + if (ip6[i] != 0) > + return false; > + return true; > +} This is ipv6_addr_any().
Re: [net-next 01/12] net/mlx5e: Ethtool steering flow validation refactoring
From: Saeed Mahameed Date: Fri, 10 Aug 2018 15:26:19 -0700 > +static int validate_tcpudp4(struct ethtool_rx_flow_spec *fs) > +{ > + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec; > + int ntuples = 0; > + > + if (fs->m_u.tcp_ip4_spec.tos) Since you've gone to the trouble of loading this object into l4_mask before this statement, please use "if (l4_mask->tos)" I know you're just moving existing code into a function, but it looks silly now.
Re: [PATCH net-next v2] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
From: Michael Chan Date: Fri, 10 Aug 2018 18:24:43 -0400 > From: Vasundhara Volam > > This patch fixes following smatch warnings: > > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 > bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for > 'seg_hdr->signature' (5 vs 4) > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 > bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for > 'record->signature' (5 vs 4) > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 > bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large > for 'record->os_name' (65 vs 32) > > Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") > Reported-by: Dan Carpenter > Signed-off-by: Vasundhara Volam > Signed-off-by: Michael Chan That seems to make them all go away, applied, thanks!
Re: [PATCH bpf-next 0/9] Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY and BPF_PROG_TYPE_SK_REUSEPORT
On 08/08/2018 09:59 AM, Martin KaFai Lau wrote: > This series introduces a new map type "BPF_MAP_TYPE_REUSEPORT_SOCKARRAY" > and a new prog type BPF_PROG_TYPE_SK_REUSEPORT. > > Here is a snippet from a commit message: > > "To unleash the full potential of a bpf prog, it is essential for the > userspace to be capable of directly setting up a bpf map which can then > be consumed by the bpf prog to make decision. In this case, decide which > SO_REUSEPORT sk to serve the incoming request. > > By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control > and visibility on where a SO_REUSEPORT sk should be located in a bpf map. > The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that > the bpf prog can directly select a sk from the bpf map. That will > raise the programmability of the bpf prog attached to a reuseport > group (a group of sk serving the same IP:PORT). > > For example, in UDP, the bpf prog can peek into the payload (e.g. > through the "data" pointer introduced in the later patch) to learn > the application level's connection information and then decide which sk > to pick from a bpf map. The userspace can tightly couple the sk's location > in a bpf map with the application logic in generating the UDP payload's > connection information. This connection info contact/API stays within the > userspace. > > Also, when used with map-in-map, the userspace can switch the > old-server-process's inner map to a new-server-process's inner map > in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". > The bpf prog will then direct incoming requests to the new process instead > of the old process. The old process can finish draining the pending > requests (e.g. by "accept()") before closing the old-fds. 
[Note that > deleting a fd from a bpf map does not necessary mean the fd is closed]" > > Please see individual patch for details > > Martin KaFai Lau (9): > tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket > net: Add ID (if needed) to sock_reuseport and expose reuseport_lock > bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY > bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT > bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection > bpf: Refactor ARRAY_SIZE macro to bpf_util.h > bpf: Sync bpf.h uapi to tools/ > bpf: test BPF_MAP_TYPE_REUSEPORT_SOCKARRAY > bpf: Test BPF_PROG_TYPE_SK_REUSEPORT Applied to bpf-next, thanks Martin!
Re: [PATCH bpf-next 0/3] bpf: add bpffs pretty print for hash/lru_hash maps
On 08/09/2018 05:55 PM, Yonghong Song wrote: > Commit a26ca7c982cb ("bpf: btf: Add pretty print support to > the basic arraymap") added pretty print support to array map. > This patch adds pretty print for hash and lru_hash maps. > > The following example shows the pretty-print result of a pinned hashmap. > Without this patch set, user will get an error instead. > > struct map_value { > int count_a; > int count_b; > }; > > cat /sys/fs/bpf/pinned_hash_map: > > 87907: {87907,87908} > 57354: {37354,57355} > 76625: {76625,76626} > ... > > Patch #1 fixed a bug in bpffs map_seq_next() function so that > all elements in the hash table will be traversed. > Patch #2 implemented map_seq_show_elem() and map_check_btf() > callback functions for hash and lru hash maps. > Patch #3 enhanced tools/testing/selftests/bpf/test_btf.c to > test bpffs hash and lru hash map pretty print. > > Yonghong Song (3): > bpf: fix bpffs non-array map seq_show issue > bpf: btf: add pretty print for hash/lru_hash maps > tools/bpf: add bpffs pretty print btf test for hash/lru_hash maps > > kernel/bpf/hashtab.c | 44 + > kernel/bpf/inode.c | 8 ++-- > tools/testing/selftests/bpf/test_btf.c | 87 > -- > 3 files changed, 121 insertions(+), 18 deletions(-) Applied to bpf-next, thanks Yonghong!
Re: [PATCH v2 bpf-next] BPF: helpers: New helper to obtain namespace data from current task
On Fri, Aug 10, 2018 at 08:41:04AM -0400, Carlos Neira wrote: > This helper obtains the active namespace from current and returns pid, tgid, > device and namespace id as seen from that namespace, allowing to instrument > a process inside a container. > Device is read from /proc/self/ns/pid, as in the future it's possible that > different pid_ns files may belong to different devices, according > to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers > conference. > > Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's > scripts but this helper returns the pid as seen by the root namespace which is > fine when a bcc script is not executed inside a container. > When the process of interest is inside a container, pid filtering will not > work > if bpf_get_current_pid_tgid() is used. This helper addresses this limitation > returning the pid as it's seen by the current namespace where the script is > executing. > > This helper has the same use cases as bpf_get_current_pid_tgid() as it can be > used to do pid filtering even inside a container. 
> > For example a bcc script using bpf_get_current_pid_tgid() > (tools/funccount.py): > > u32 pid = bpf_get_current_pid_tgid() >> 32; > if (pid != ) > return 0; > > Could be modified to use bpf_get_current_pidns_info() as follows: > > struct bpf_pidns pidns; > bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns)); > u32 pid = pidns.tgid; > u32 nsid = pidns.nsid; > if ((pid != ) && (nsid != )) > return 0; > > To find out the name PID namespace id of a process, you could use this > command: > > $ ps -h -o pidns -p > > Or this other command: > > $ ls -Li /proc//ns/pid > > Signed-off-by: Carlos Antonio Neira Bustos > --- > include/linux/bpf.h | 1 + > include/uapi/linux/bpf.h | 24 +++- > kernel/bpf/core.c | 1 + > kernel/bpf/helpers.c | 64 > +++ > kernel/trace/bpf_trace.c | 2 + > samples/bpf/Makefile | 3 ++ > samples/bpf/trace_ns_info_user.c | 35 + > samples/bpf/trace_ns_info_user_kern.c | 45 ++ > tools/include/uapi/linux/bpf.h| 24 +++- > tools/testing/selftests/bpf/bpf_helpers.h | 3 ++ > 10 files changed, 200 insertions(+), 2 deletions(-) > create mode 100644 samples/bpf/trace_ns_info_user.c > create mode 100644 samples/bpf/trace_ns_info_user_kern.c > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index cd8790d2c6ed..3f4b999f7c99 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -787,6 +787,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto; > extern const struct bpf_func_proto bpf_sock_map_update_proto; > extern const struct bpf_func_proto bpf_sock_hash_update_proto; > extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto; > > extern const struct bpf_func_proto bpf_get_local_storage_proto; > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index dd5758dc35d3..8462f9881465 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2113,6 +2113,18 @@ union bpf_attr { > * the shared data. 
> * Return > * Pointer to the local storage area. > + * > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 > size_of_pidns) > + * Description > + * Copies into *pidns* pid, namespace id and tgid as seen by the > + * current namespace and also device from /proc/self/ns/pid. > + * *size_of_pidns* must be the size of *pidns* > + * > + * This helper is used when pid filtering is needed inside a > + * container as bpf_get_current_tgid() helper returns always the > + * pid id as seen by the root namespace. > + * Return > + * 0 on success -EINVAL on error. > */ > #define __BPF_FUNC_MAPPER(FN)\ > FN(unspec), \ > @@ -2196,7 +2208,8 @@ union bpf_attr { > FN(rc_keydown), \ > FN(skb_cgroup_id), \ > FN(get_current_cgroup_id), \ > - FN(get_local_storage), > + FN(get_local_storage), \ > + FN(get_current_pidns_info), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -2724,4 +2737,13 @@ enum bpf_task_fd_type { > BPF_FD_TYPE_URETPROBE, /* filename + offset */ > }; > > +/* helper bpf_get_current_pidns_info will store the following > + * data, dev will contain major/minor from /proc/self/ns/pid. > + */ > +struct bpf_pidns_info { > + __u32 dev; > + __u32 nsid; > + __u32 tgid; > + __u32 pid; > +}; > #endif /* _UAPI__LIN
[net-next 03/12] net/mlx5e: Ethtool steering, ip6 support
Add ip6 support for ethtool flow steering. New supported flow types: ip6|tcp6|udp6| Supported fields: src-ip|dst-ip|src-port|dst-port Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_fs_ethtool.c| 138 ++ 1 file changed, 138 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index f2fa189adc4f..646b659fe805 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -66,11 +66,14 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv, switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { case TCP_V4_FLOW: case UDP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V6_FLOW: max_tuples = ETHTOOL_NUM_L3_L4_FTS; prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; break; case IP_USER_FLOW: + case IPV6_USER_FLOW: max_tuples = ETHTOOL_NUM_L3_L4_FTS; prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; @@ -142,6 +145,39 @@ set_ip4(void *headers_c, void *headers_v, __be32 ip4src_m, MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP); } +static bool is_zero_ip6(__be32 ip6[4]) +{ + int i; + + for (i = 0; i < 4; i++) + if (ip6[i] != 0) + return false; + return true; +} + +static void +set_ip6(void *headers_c, void *headers_v, __be32 ip6src_m[4], + __be32 ip6src_v[4], __be32 ip6dst_m[4], __be32 ip6dst_v[4]) +{ + u8 ip6_sz = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6); + + if (!is_zero_ip6(ip6src_m)) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6), + ip6src_v, ip6_sz); + memset(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, ip6_sz); + } + if (!is_zero_ip6(ip6dst_m)) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + ip6dst_v, ip6_sz); + memset(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, ip6_sz); + 
} + + MLX5E_FTE_SET(headers_c, ethertype, 0x); + MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IPV6); +} + static void set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, __be16 pdst_m, __be16 pdst_v) @@ -213,6 +249,42 @@ parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) l3_mask->ip4dst, l3_val->ip4dst); } +static void +parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec; + struct ethtool_usrip6_spec *l3_val = &fs->h_u.usr_ip6_spec; + + set_ip6(headers_c, headers_v, l3_mask->ip6src, + l3_val->ip6src, l3_mask->ip6dst, l3_val->ip6dst); +} + +static void +parse_tcp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec; + struct ethtool_tcpip6_spec *l4_val = &fs->h_u.tcp_ip6_spec; + + set_ip6(headers_c, headers_v, l4_mask->ip6src, + l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst); + + set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_udp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.udp_ip6_spec; + struct ethtool_tcpip6_spec *l4_val = &fs->h_u.udp_ip6_spec; + + set_ip6(headers_c, headers_v, l4_mask->ip6src, + l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst); + + set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + static void parse_ether(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) { @@ -264,6 +336,15 @@ static int set_flow_attrs(u32 *match_c, u32 *match_v, case IP_USER_FLOW: parse_ip4(outer_headers_c, outer_headers_v, fs); break; + case TCP_V6_FLOW: + parse_tcp6(outer_headers_c, outer_headers_v, fs); + break; + case UDP_V6_FLOW: + parse_udp6(outer_headers_c, outer_headers_v, fs); + break; + case IPV6_USER_FLOW: + parse_ip6(outer_headers_c, outer_headers_v, 
fs); + break; case ETHER_FLOW: parse_ether(outer_headers_c, outer_headers_v, fs); break; @@ -473,6 +554,50 @@ static int validate_ip4(struct ethtool_rx_flow_spec *fs) return ++ntuples;
[net-next 11/12] net/mlx5: Reorganize the makefile
Reorganize the Makefile and group files together according to their functionality and importance. Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 57 +-- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 9e78c48b22dd..d324a3884462 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -1,38 +1,61 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_MLX5_CORE)+= mlx5_core.o +# +# Makefile for Mellanox 5th generation network adapters +# (ConnectX series) core & netdev driver +# + subdir-ccflags-y += -I$(src) +obj-$(CONFIG_MLX5_CORE) += mlx5_core.o + +# +# mlx5 core basic +# mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o \ diag/fs_tracepoint.o diag/fw_tracer.o -mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o - -mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ - fpga/ipsec.o fpga/tls.o - -mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o -mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o -mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o - +# +# Netdev basic +# mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ en_selftest.o en/port.o -mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o -mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o +# +# Netdev extra +# +mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o +mlx5_core-$(CONFIG_MLX5_EN_RXNFC)+= en_fs_ethtool.o +mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o + +# +# Core extra +# +mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o 
+mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o +mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o +mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o -mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o en_rep.o en_tc.o +# +# Ipoib netdev +# +mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o -mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o +# +# Accelerations & FPGA +# +mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o -mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o +mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ +fpga/ipsec.o fpga/tls.o mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \ - en_accel/ipsec_stats.o +en_accel/ipsec_stats.o -mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o en_accel/tls_stats.o +mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o en_accel/tls_stats.o CFLAGS_tracepoint.o := -I$(src) -- 2.17.0
[net-next 10/12] net/mlx5e: clock.c depends on CONFIG_PTP_1588_CLOCK
From: Moshe Shemesh lib/clock.c includes clock related functions which require ptp support. Thus compile out lib/clock.c and add the needed function stubs in case kconfig CONFIG_PTP_1588_CLOCK is off. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 3 ++- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +++--- .../net/ethernet/mellanox/mlx5/core/en_main.c | 4 +++- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 1 + .../ethernet/mellanox/mlx5/core/lib/clock.h | 24 +++ .../ethernet/mellanox/mlx5/core/mlx5_core.h | 1 - 6 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 09b5e235527b..9e78c48b22dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -5,7 +5,7 @@ subdir-ccflags-y += -I$(src) mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ - fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o lib/clock.o \ + fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o \ diag/fs_tracepoint.o diag/fw_tracer.o mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o @@ -15,6 +15,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o +mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 7787cc3a2c84..98dd3e0ada72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -32,6 +32,7 @@ 
#include "en.h" #include "en/port.h" +#include "lib/clock.h" void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, struct ethtool_drvinfo *drvinfo) @@ -1106,10 +1107,10 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, if (ret) return ret; - info->phc_index = mdev->clock.ptp ? - ptp_clock_index(mdev->clock.ptp) : -1; + info->phc_index = mlx5_clock_get_ptp_index(mdev); - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) || + info->phc_index == -1) return 0; info->so_timestamping |= SOF_TIMESTAMPING_TX_HARDWARE | diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e09086f41365..5a7939e70190 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -46,6 +46,7 @@ #include "accel/ipsec.h" #include "accel/tls.h" #include "lib/vxlan.h" +#include "lib/clock.h" #include "en/port.h" #include "en/xdp.h" @@ -3782,7 +3783,8 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) struct hwtstamp_config config; int err; - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) || + (mlx5_clock_get_ptp_index(priv->mdev) == -1)) return -EOPNOTSUPP; if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 7669b4380779..48864f4988a4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -40,6 +40,7 @@ #include "mlx5_core.h" #include "fpga/core.h" #include "eswitch.h" +#include "lib/clock.h" #include "diag/fw_tracer.h" enum { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index a8eecedd46c2..02e2e4575e4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -33,8 +33,15 @@ #ifndef __LIB_CLOCK_H__ #define __LIB_CLOCK_H__ +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) void mlx5_init_clock(struct mlx5_core_dev *mdev); void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); +void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); + +static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) +{ + return mdev->clock.ptp ? ptp_clock_index(mdev->clock.ptp) : -1; +} static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, u64 timestamp
[net-next 08/12] net/mlx5e: Move flow steering declarations into en/fs.h
Move flow steering declarations and definitions into the dedicated en/fs.h header file Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 126 - .../net/ethernet/mellanox/mlx5/core/en/fs.h | 129 ++ 2 files changed, 129 insertions(+), 126 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 8743bbe1baa2..db2cfcd21d43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -627,112 +627,12 @@ struct mlx5e_channel_stats { struct mlx5e_xdpsq_stats xdpsq; } cacheline_aligned_in_smp; -enum mlx5e_traffic_types { - MLX5E_TT_IPV4_TCP, - MLX5E_TT_IPV6_TCP, - MLX5E_TT_IPV4_UDP, - MLX5E_TT_IPV6_UDP, - MLX5E_TT_IPV4_IPSEC_AH, - MLX5E_TT_IPV6_IPSEC_AH, - MLX5E_TT_IPV4_IPSEC_ESP, - MLX5E_TT_IPV6_IPSEC_ESP, - MLX5E_TT_IPV4, - MLX5E_TT_IPV6, - MLX5E_TT_ANY, - MLX5E_NUM_TT, - MLX5E_NUM_INDIR_TIRS = MLX5E_TT_ANY, -}; - -enum mlx5e_tunnel_types { - MLX5E_TT_IPV4_GRE, - MLX5E_TT_IPV6_GRE, - MLX5E_NUM_TUNNEL_TT, -}; - enum { MLX5E_STATE_ASYNC_EVENTS_ENABLED, MLX5E_STATE_OPENED, MLX5E_STATE_DESTROYING, }; -struct mlx5e_l2_rule { - u8 addr[ETH_ALEN + 2]; - struct mlx5_flow_handle *rule; -}; - -#define MLX5E_L2_ADDR_HASH_SIZE BIT(BITS_PER_BYTE) - -struct mlx5e_tc_table { - struct mlx5_flow_table *t; - - struct rhashtable ht; - - DECLARE_HASHTABLE(mod_hdr_tbl, 8); - DECLARE_HASHTABLE(hairpin_tbl, 8); -}; - -struct mlx5e_vlan_table { - struct mlx5e_flow_table ft; - DECLARE_BITMAP(active_cvlans, VLAN_N_VID); - DECLARE_BITMAP(active_svlans, VLAN_N_VID); - struct mlx5_flow_handle *active_cvlans_rule[VLAN_N_VID]; - struct mlx5_flow_handle *active_svlans_rule[VLAN_N_VID]; - struct mlx5_flow_handle *untagged_rule; - struct mlx5_flow_handle *any_cvlan_rule; - struct mlx5_flow_handle *any_svlan_rule; - boolcvlan_filter_disabled; -}; - -struct mlx5e_l2_table { - struct mlx5e_flow_tableft; - struct hlist_head 
netdev_uc[MLX5E_L2_ADDR_HASH_SIZE]; - struct hlist_head netdev_mc[MLX5E_L2_ADDR_HASH_SIZE]; - struct mlx5e_l2_rule broadcast; - struct mlx5e_l2_rule allmulti; - struct mlx5e_l2_rule promisc; - bool broadcast_enabled; - bool allmulti_enabled; - bool promisc_enabled; -}; - -/* L3/L4 traffic type classifier */ -struct mlx5e_ttc_table { - struct mlx5e_flow_table ft; - struct mlx5_flow_handle *rules[MLX5E_NUM_TT]; - struct mlx5_flow_handle *tunnel_rules[MLX5E_NUM_TUNNEL_TT]; -}; - -/* NIC prio FTS */ -enum { - MLX5E_VLAN_FT_LEVEL = 0, - MLX5E_L2_FT_LEVEL, - MLX5E_TTC_FT_LEVEL, - MLX5E_INNER_TTC_FT_LEVEL, -#ifdef CONFIG_MLX5_EN_ARFS - MLX5E_ARFS_FT_LEVEL -#endif -}; - -enum { - MLX5E_TC_FT_LEVEL = 0, - MLX5E_TC_TTC_FT_LEVEL, -}; - -struct mlx5e_flow_steering { - struct mlx5_flow_namespace *ns; -#ifdef CONFIG_MLX5_EN_RXNFC - struct mlx5e_ethtool_steering ethtool; -#endif - struct mlx5e_tc_table tc; - struct mlx5e_vlan_table vlan; - struct mlx5e_l2_table l2; - struct mlx5e_ttc_table ttc; - struct mlx5e_ttc_table inner_ttc; -#ifdef CONFIG_MLX5_EN_ARFS - struct mlx5e_arfs_tablesarfs; -#endif -}; - struct mlx5e_rqt { u32 rqtn; bool enabled; @@ -866,10 +766,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, void mlx5e_update_stats(struct mlx5e_priv *priv); -int mlx5e_create_flow_steering(struct mlx5e_priv *priv); -void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv); void mlx5e_init_l2_addr(struct mlx5e_priv *priv); -void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); int mlx5e_self_test_num(struct mlx5e_priv *priv); void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); @@ -883,8 +780,6 @@ int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); -void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv); -void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv); void 
mlx5e_timestamp_init(struct mlx5e_priv *priv); struct mlx5e_redirect_rqt_param { @@ -1021,27 +916,6 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv); void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv); void mlx5e_destroy_rqt(stru
[net-next 09/12] net/mlx5e: vxlan.c depends on CONFIG_VXLAN
When vxlan is not enabled by kernel, no need to enable it in mlx5. Compile out lib/vxlan.c if CONFIG_VXLAN is not selected. Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + drivers/net/ethernet/mellanox/mlx5/core/Makefile| 7 --- drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h | 12 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 01f9ba1a2098..37a551436e4a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -7,6 +7,7 @@ config MLX5_CORE depends on MAY_USE_DEVLINK depends on PCI imply PTP_1588_CLOCK + imply VXLAN default n ---help--- Core driver for low level functionality of the ConnectX-4 and diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index ae9da4b51487..09b5e235527b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -13,15 +13,16 @@ mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ fpga/ipsec.o fpga/tls.o +mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o +mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o + mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ - en_selftest.o en/port.o lib/vxlan.o + en_selftest.o en/port.o mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o -mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o - mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o en_rep.o en_tc.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h index fd874a30c4d0..8fb0eb08fa6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h @@ -37,8 +37,6 @@ struct mlx5_vxlan; struct mlx5_vxlan_port; -#ifdef CONFIG_MLX5_CORE_EN - static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan) { /* not allowed reason is encoded in vxlan pointer as error, @@ -47,18 +45,20 @@ static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan) return !IS_ERR_OR_NULL(vxlan); } +#if IS_ENABLED(CONFIG_VXLAN) struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev); void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); - #else - static inline struct mlx5_vxlan* -mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-ENOTSUPP); } +mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-EOPNOTSUPP); } static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } - +static inline int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } +static inline int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } +static inline struct mlx5_vxlan_port* +mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) { return NULL; } #endif #endif /* __MLX5_VXLAN_H__ */ -- 2.17.0
[net-next 05/12] net/mlx5e: Ethtool steering, move ethtool callbacks
Move ethtool rxnfc callbacks into the en_fs_ethtool file where they belong. This will allow us to make many ethtool fs related helper functions static. Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 11 +-- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 48 - .../mellanox/mlx5/core/en_fs_ethtool.c| 67 --- 3 files changed, 62 insertions(+), 64 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 29a3be97..31a29b73f558 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -912,16 +912,11 @@ void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); int mlx5e_self_test_num(struct mlx5e_priv *priv); void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); -int mlx5e_ethtool_get_flow(struct mlx5e_priv *priv, struct ethtool_rxnfc *info, - int location); -int mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, - struct ethtool_rxnfc *info, u32 *rule_locs); -int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, - struct ethtool_rx_flow_spec *fs); -int mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, - int location); void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); +int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); +int mlx5e_get_rxnfc(struct net_device *dev, + struct ethtool_rxnfc *info, u32 *rule_locs); void mlx5e_set_rx_mode_work(struct work_struct *work); int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index fffe514ba855..cde1a0bb9c4a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -969,33 +969,6 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, return 0; 
} -static int mlx5e_get_rxnfc(struct net_device *netdev, - struct ethtool_rxnfc *info, u32 *rule_locs) -{ - struct mlx5e_priv *priv = netdev_priv(netdev); - int err = 0; - - switch (info->cmd) { - case ETHTOOL_GRXRINGS: - info->data = priv->channels.params.num_channels; - break; - case ETHTOOL_GRXCLSRLCNT: - info->rule_cnt = priv->fs.ethtool.tot_num_rules; - break; - case ETHTOOL_GRXCLSRULE: - err = mlx5e_ethtool_get_flow(priv, info, info->fs.location); - break; - case ETHTOOL_GRXCLSRLALL: - err = mlx5e_ethtool_get_all_flows(priv, info, rule_locs); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - #define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC100 #define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC 8000 #define MLX5E_PFC_PREVEN_MINOR_PRECENT 85 @@ -1606,26 +1579,6 @@ static u32 mlx5e_get_priv_flags(struct net_device *netdev) return priv->channels.params.pflags; } -static int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) -{ - int err = 0; - struct mlx5e_priv *priv = netdev_priv(dev); - - switch (cmd->cmd) { - case ETHTOOL_SRXCLSRLINS: - err = mlx5e_ethtool_flow_replace(priv, &cmd->fs); - break; - case ETHTOOL_SRXCLSRLDEL: - err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, struct ethtool_flash *flash) { @@ -1696,5 +1649,4 @@ const struct ethtool_ops mlx5e_ethtool_ops = { .self_test = mlx5e_self_test, .get_msglevel = mlx5e_get_msglevel, .set_msglevel = mlx5e_set_msglevel, - }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index c0abcf48dfe5..881cb2475c18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -684,8 +684,9 @@ static int validate_flow(struct mlx5e_priv *priv, return num_tuples; } -int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, 
- struct ethtool_rx_flow_spec *fs) +static int +mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs) { struct mlx5e_ethtool_table *eth_ft; struct mlx5e_ethtool_rule *eth_rule; @@ -
[net-next 04/12] net/mlx5e: Ethtool steering, l4 proto support
Add support for l4 proto ip field in ethtool flow steering. Example: Redirect icmpv6 to rx queue #2 ethtool -U eth0 flow-type ip6 l4proto 58 action 2 Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_fs_ethtool.c| 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 646b659fe805..c0abcf48dfe5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -247,6 +247,11 @@ parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) set_ip4(headers_c, headers_v, l3_mask->ip4src, l3_val->ip4src, l3_mask->ip4dst, l3_val->ip4dst); + + if (l3_mask->proto) { + MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->proto); + MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->proto); + } } static void @@ -257,6 +262,11 @@ parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) set_ip6(headers_c, headers_v, l3_mask->ip6src, l3_val->ip6src, l3_mask->ip6dst, l3_val->ip6dst); + + if (l3_mask->l4_proto) { + MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->l4_proto); + MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->l4_proto); + } } static void @@ -537,7 +547,7 @@ static int validate_ip4(struct ethtool_rx_flow_spec *fs) struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec; int ntuples = 0; - if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto || + if (l3_mask->l4_4_bytes || l3_mask->tos || fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4) return -EINVAL; if (l3_mask->ip4src) { @@ -550,6 +560,8 @@ static int validate_ip4(struct ethtool_rx_flow_spec *fs) return -EINVAL; ntuples++; } + if (l3_mask->proto) + ntuples++; /* Flow is IPv4 */ return ++ntuples; } @@ -559,13 +571,14 @@ static int validate_ip6(struct ethtool_rx_flow_spec *fs) struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec; int ntuples = 0; - if (l3_mask->l4_4_bytes || 
l3_mask->tclass || l3_mask->l4_proto) + if (l3_mask->l4_4_bytes || l3_mask->tclass) return -EINVAL; if (!is_zero_ip6(l3_mask->ip6src)) ntuples++; if (!is_zero_ip6(l3_mask->ip6dst)) ntuples++; - + if (l3_mask->l4_proto) + ntuples++; /* Flow is IPv6 */ return ++ntuples; } -- 2.17.0
[net-next 12/12] net/mlx5: Improve argument name for add flow API
From: Eli Cohen The last argument to mlx5_add_flow_rules passes the number of destinations in the struct pointed to by the dest arg. Change the name to better reflect this fact. Signed-off-by: Eli Cohen Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 include/linux/mlx5/fs.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index a21df24b695e..261cb6aacf12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1876,7 +1876,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_spec *spec, struct mlx5_flow_act *flow_act, struct mlx5_flow_destination *dest, - int dest_num) + int num_dest) { struct mlx5_flow_root_namespace *root = find_root(&ft->node); struct mlx5_flow_destination gen_dest = {}; @@ -1889,7 +1889,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { if (!fwd_next_prio_supported(ft)) return ERR_PTR(-EOPNOTSUPP); - if (dest_num) + if (num_dest) return ERR_PTR(-EINVAL); mutex_lock(&root->chain_lock); next_ft = find_next_chained_ft(prio); @@ -1897,7 +1897,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; gen_dest.ft = next_ft; dest = &gen_dest; - dest_num = 1; + num_dest = 1; flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } else { mutex_unlock(&root->chain_lock); @@ -1905,7 +1905,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, } } - handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, dest_num); + handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { if (!IS_ERR_OR_NULL(handle) && diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c40f2fc68655..71fb503b2b52 100644 --- 
a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -177,7 +177,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_spec *spec, struct mlx5_flow_act *flow_act, struct mlx5_flow_destination *dest, - int dest_num); + int num_dest); void mlx5_del_flow_rules(struct mlx5_flow_handle *fr); int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, -- 2.17.0
[net-next 07/12] net/mlx5e: Add CONFIG_MLX5_EN_ARFS for accelerated flow steering support
Add new mlx5 Kconfig flag to allow selecting accelerated flow steering support, and compile out en_arfs.c if not selected. Move arfs declarations and definitions to en/fs.h header file. Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh --- .../net/ethernet/mellanox/mlx5/core/Kconfig | 8 +++ .../net/ethernet/mellanox/mlx5/core/Makefile | 3 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 61 ++- .../net/ethernet/mellanox/mlx5/core/en/fs.h | 46 ++ .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 4 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 14 ++--- 6 files changed, 68 insertions(+), 68 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 1ff5f12ab12d..01f9ba1a2098 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -35,6 +35,14 @@ config MLX5_CORE_EN ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. +config MLX5_EN_ARFS + bool "Mellanox MLX5 ethernet accelerated receive flow steering (ARFS) support" + depends on MLX5_CORE_EN && RFS_ACCEL + default y + ---help--- + Mellanox MLX5 ethernet hardware-accelerated receive flow steering support, + Enables ethernet netdevice arfs support and ntuple filtering. 
+ config MLX5_EN_RXNFC bool "Mellanox MLX5 ethernet rx nfc flow steering support" depends on MLX5_CORE_EN diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 7b6f9d2c32c9..ae9da4b51487 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -15,8 +15,9 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ - en_arfs.o en_selftest.o en/port.o lib/vxlan.o + en_selftest.o en/port.o lib/vxlan.o +mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 19728f9f25e7..8743bbe1baa2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -660,12 +660,6 @@ struct mlx5e_l2_rule { struct mlx5_flow_handle *rule; }; -struct mlx5e_flow_table { - int num_groups; - struct mlx5_flow_table *t; - struct mlx5_flow_group **g; -}; - #define MLX5E_L2_ADDR_HASH_SIZE BIT(BITS_PER_BYTE) struct mlx5e_tc_table { @@ -708,38 +702,15 @@ struct mlx5e_ttc_table { struct mlx5_flow_handle *tunnel_rules[MLX5E_NUM_TUNNEL_TT]; }; -#define ARFS_HASH_SHIFT BITS_PER_BYTE -#define ARFS_HASH_SIZE BIT(BITS_PER_BYTE) -struct arfs_table { - struct mlx5e_flow_table ft; - struct mlx5_flow_handle *default_rule; - struct hlist_headrules_hash[ARFS_HASH_SIZE]; -}; - -enum arfs_type { - ARFS_IPV4_TCP, - ARFS_IPV6_TCP, - ARFS_IPV4_UDP, - ARFS_IPV6_UDP, - ARFS_NUM_TYPES, -}; - -struct mlx5e_arfs_tables { - struct arfs_table arfs_tables[ARFS_NUM_TYPES]; - /* Protect aRFS rules list */ - spinlock_t arfs_lock; - struct list_head rules; - intlast_filter_id; - struct 
workqueue_struct*wq; -}; - /* NIC prio FTS */ enum { MLX5E_VLAN_FT_LEVEL = 0, MLX5E_L2_FT_LEVEL, MLX5E_TTC_FT_LEVEL, MLX5E_INNER_TTC_FT_LEVEL, +#ifdef CONFIG_MLX5_EN_ARFS MLX5E_ARFS_FT_LEVEL +#endif }; enum { @@ -757,7 +728,9 @@ struct mlx5e_flow_steering { struct mlx5e_l2_table l2; struct mlx5e_ttc_table ttc; struct mlx5e_ttc_table inner_ttc; +#ifdef CONFIG_MLX5_EN_ARFS struct mlx5e_arfs_tablesarfs; +#endif }; struct mlx5e_rqt { @@ -1028,32 +1001,6 @@ void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv); void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv); #endif -#ifndef CONFIG_RFS_ACCEL -static inline int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) -{ - return 0; -} - -static inline void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) {} - -static inline int mlx5e_arfs_enable(struct mlx5e_priv *priv) -{ - return -EOPNOTSUPP; -} - -static inline int mlx5e_arfs_disable(struct mlx5e_priv *priv) -{ - return -EOPNOTSUPP; -} -#else -int mlx5e_arfs_create_tables(struct mlx5e_priv *priv); -void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv); -int mlx5e_arfs_enable(struct mlx5e_priv *priv); -int mlx5e_arfs_disable(struct mlx5e_priv *priv); -int mlx5e_rx_flow_steer
[net-next 06/12] net/mlx5e: Add CONFIG_MLX5_EN_RXNFC for ethtool rx nfc
Add new mlx5 Kconfig flag to allow selecting ethtool rx nfc support, and compile out en_fs_ethtool.c if not selected. Add en/fs.h header file to host all steering declarations and definitions. Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh --- .../net/ethernet/mellanox/mlx5/core/Kconfig | 10 ++ .../net/ethernet/mellanox/mlx5/core/Makefile | 4 ++- drivers/net/ethernet/mellanox/mlx5/core/en.h | 23 ++-- .../net/ethernet/mellanox/mlx5/core/en/fs.h | 35 +++ .../ethernet/mellanox/mlx5/core/en_ethtool.c | 2 ++ 5 files changed, 53 insertions(+), 21 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 7a84dd07ced2..1ff5f12ab12d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -35,6 +35,16 @@ config MLX5_CORE_EN ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. +config MLX5_EN_RXNFC + bool "Mellanox MLX5 ethernet rx nfc flow steering support" + depends on MLX5_CORE_EN + default y + ---help--- + Mellanox MLX5 ethernet rx nfc flow steering support + Enables ethtool receive network flow classification, which allows user defined + flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc + API. 
+ config MLX5_MPFS bool "Mellanox Technologies MLX5 MPFS support" depends on MLX5_CORE_EN diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index f20fda1ced4f..7b6f9d2c32c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -15,7 +15,9 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ - en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o lib/vxlan.o + en_arfs.o en_selftest.o en/port.o lib/vxlan.o + +mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 31a29b73f558..19728f9f25e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -52,6 +52,7 @@ #include "wq.h" #include "mlx5_core.h" #include "en_stats.h" +#include "en/fs.h" struct page_pool; @@ -746,24 +747,11 @@ enum { MLX5E_TC_TTC_FT_LEVEL, }; -struct mlx5e_ethtool_table { - struct mlx5_flow_table *ft; - intnum_rules; -}; - -#define ETHTOOL_NUM_L3_L4_FTS 7 -#define ETHTOOL_NUM_L2_FTS 4 - -struct mlx5e_ethtool_steering { - struct mlx5e_ethtool_table l3_l4_ft[ETHTOOL_NUM_L3_L4_FTS]; - struct mlx5e_ethtool_table l2_ft[ETHTOOL_NUM_L2_FTS]; - struct list_headrules; - int tot_num_rules; -}; - struct mlx5e_flow_steering { struct mlx5_flow_namespace *ns; +#ifdef CONFIG_MLX5_EN_RXNFC struct mlx5e_ethtool_steering ethtool; +#endif struct mlx5e_tc_table tc; struct mlx5e_vlan_table vlan; struct mlx5e_l2_table l2; @@ -912,11 +900,6 @@ void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); int mlx5e_self_test_num(struct mlx5e_priv *priv); void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, u64 
*buf); -void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); -void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); -int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); -int mlx5e_get_rxnfc(struct net_device *dev, - struct ethtool_rxnfc *info, u32 *rule_locs); void mlx5e_set_rx_mode_work(struct work_struct *work); int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h new file mode 100644 index ..50b0784787bc --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. */ + +#ifndef __MLX5E_FLOW_STEER_H__ +#define __MLX5E_FLOW_STEER_H__ + +#ifdef CONFIG_MLX5_EN_RXNFC + +struct mlx5e_ethtool_table { + struct mlx5_flow_table *ft; + intnum_rules; +}; + +#define ETHTOOL_NUM_L3_L4_FTS 7 +#define ETHTOOL_NUM_L2_FTS 4 + +struct mlx5e_ethtool_steering { + struct mlx5e_ethtool_table l3_l4_ft[ETHTOOL_NUM_L3_L4_FTS]; + struct mlx5e_ethtool_t
[net-next 01/12] net/mlx5e: Ethtool steering flow validation refactoring
Have a ethtool rx flow spec validation helper function per flow type. Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_fs_ethtool.c| 164 +++--- 1 file changed, 100 insertions(+), 64 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index eafc59280ada..34d38a359712 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -379,16 +379,95 @@ static struct mlx5e_ethtool_rule *get_ethtool_rule(struct mlx5e_priv *priv, #define all_zeros_or_all_ones(field) \ ((field) == 0 || (field) == (__force typeof(field))-1) +static int validate_ethter(struct ethtool_rx_flow_spec *fs) +{ + struct ethhdr *eth_mask = &fs->m_u.ether_spec; + int ntuples = 0; + + if (!is_zero_ether_addr(eth_mask->h_dest)) + ntuples++; + if (!is_zero_ether_addr(eth_mask->h_source)) + ntuples++; + if (eth_mask->h_proto) + ntuples++; + return ntuples; +} + +static int validate_tcpudp4(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec; + int ntuples = 0; + + if (fs->m_u.tcp_ip4_spec.tos) + return -EINVAL; + + if (l4_mask->ip4src) { + if (!all_ones(l4_mask->ip4src)) + return -EINVAL; + ntuples++; + } + if (l4_mask->ip4dst) { + if (!all_ones(l4_mask->ip4dst)) + return -EINVAL; + ntuples++; + } + if (l4_mask->psrc) { + if (!all_ones(l4_mask->psrc)) + return -EINVAL; + ntuples++; + } + if (l4_mask->pdst) { + if (!all_ones(l4_mask->pdst)) + return -EINVAL; + ntuples++; + } + /* Flow is TCP/UDP */ + return ++ntuples; +} + +static int validate_ip4(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec; + int ntuples = 0; + + if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto || + fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4) + return -EINVAL; + if (l3_mask->ip4src) { + if (!all_ones(l3_mask->ip4src)) + return -EINVAL; + ntuples++; + } + if 
(l3_mask->ip4dst) { + if (!all_ones(l3_mask->ip4dst)) + return -EINVAL; + ntuples++; + } + /* Flow is IPv4 */ + return ++ntuples; +} + +static int validate_vlan(struct ethtool_rx_flow_spec *fs) +{ + if (fs->m_ext.vlan_etype || + fs->m_ext.vlan_tci != cpu_to_be16(VLAN_VID_MASK)) + return -EINVAL; + + if (fs->m_ext.vlan_tci && + (be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID)) + return -EINVAL; + + return 1; +} + static int validate_flow(struct mlx5e_priv *priv, struct ethtool_rx_flow_spec *fs) { - struct ethtool_tcpip4_spec *l4_mask; - struct ethtool_usrip4_spec *l3_mask; - struct ethhdr *eth_mask; int num_tuples = 0; + int ret = 0; if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES) - return -EINVAL; + return -ENOSPC; if (fs->ring_cookie >= priv->channels.params.num_channels && fs->ring_cookie != RX_CLS_FLOW_DISC) @@ -396,73 +475,29 @@ static int validate_flow(struct mlx5e_priv *priv, switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { case ETHER_FLOW: - eth_mask = &fs->m_u.ether_spec; - if (!is_zero_ether_addr(eth_mask->h_dest)) - num_tuples++; - if (!is_zero_ether_addr(eth_mask->h_source)) - num_tuples++; - if (eth_mask->h_proto) - num_tuples++; + num_tuples += validate_ethter(fs); break; case TCP_V4_FLOW: case UDP_V4_FLOW: - if (fs->m_u.tcp_ip4_spec.tos) - return -EINVAL; - l4_mask = &fs->m_u.tcp_ip4_spec; - if (l4_mask->ip4src) { - if (!all_ones(l4_mask->ip4src)) - return -EINVAL; - num_tuples++; - } - if (l4_mask->ip4dst) { - if (!all_ones(l4_mask->ip4dst)) - return -EINVAL; - num_tuples++; - } - if (l4_mask->psrc) { - if (!all_ones(l4_mask->psrc)) - return -EINVAL; - num_tuples++; - } - if (l4_mask->pdst) { - if (!al
[net-next 02/12] net/mlx5e: Ethtool steering flow parsing refactoring
Have a parsing function per flow type, that converts from ethtool rx flow spec to mlx5 flow spec. Will be useful to add support for ip6 ethtool flow steering in the next patch. Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_fs_ethtool.c| 230 ++ 1 file changed, 128 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 34d38a359712..f2fa189adc4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -115,29 +115,134 @@ static void mask_spec(u8 *mask, u8 *val, size_t size) *((u8 *)val) = *((u8 *)mask) & *((u8 *)val); } -static void set_ips(void *outer_headers_v, void *outer_headers_c, __be32 ip4src_m, - __be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v) +#define MLX5E_FTE_SET(header_p, fld, v) \ + MLX5_SET(fte_match_set_lyr_2_4, header_p, fld, v) + +#define MLX5E_FTE_ADDR_OF(header_p, fld) \ + MLX5_ADDR_OF(fte_match_set_lyr_2_4, header_p, fld) + +static void +set_ip4(void *headers_c, void *headers_v, __be32 ip4src_m, + __be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v) { if (ip4src_m) { - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, - src_ipv4_src_ipv6.ipv4_layout.ipv4), + memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ip4src_v, sizeof(ip4src_v)); - memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, - src_ipv4_src_ipv6.ipv4_layout.ipv4), + memset(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), 0xff, sizeof(ip4src_m)); } if (ip4dst_m) { - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ip4dst_v, sizeof(ip4dst_v)); - memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + memset(MLX5E_FTE_ADDR_OF(headers_c, 
dst_ipv4_dst_ipv6.ipv4_layout.ipv4), 0xff, sizeof(ip4dst_m)); } - MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, -ethertype, ETH_P_IP); - MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, -ethertype, 0x); + + MLX5E_FTE_SET(headers_c, ethertype, 0x); + MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP); +} + +static void +set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, + __be16 pdst_m, __be16 pdst_v) +{ + if (psrc_m) { + MLX5E_FTE_SET(headers_c, tcp_sport, 0x); + MLX5E_FTE_SET(headers_v, tcp_sport, ntohs(psrc_v)); + } + if (pdst_m) { + MLX5E_FTE_SET(headers_c, tcp_dport, 0x); + MLX5E_FTE_SET(headers_v, tcp_dport, ntohs(pdst_v)); + } + + MLX5E_FTE_SET(headers_c, ip_protocol, 0x); + MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_TCP); +} + +static void +set_udp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, + __be16 pdst_m, __be16 pdst_v) +{ + if (psrc_m) { + MLX5E_FTE_SET(headers_c, udp_sport, 0x); + MLX5E_FTE_SET(headers_c, udp_sport, ntohs(psrc_v)); + } + + if (pdst_m) { + MLX5E_FTE_SET(headers_c, udp_dport, 0x); + MLX5E_FTE_SET(headers_v, udp_dport, ntohs(pdst_v)); + } + + MLX5E_FTE_SET(headers_c, ip_protocol, 0x); + MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_UDP); +} + +static void +parse_tcp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec; + struct ethtool_tcpip4_spec *l4_val = &fs->h_u.tcp_ip4_spec; + + set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src, + l4_mask->ip4dst, l4_val->ip4dst); + + set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_udp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.udp_ip4_spec; + struct ethtool_tcpip4_spec *l4_val = &fs->h_u.udp_ip4_spec; + + set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src, + l4_mask->ip4dst, l4_val->ip4dst); + + set_udp(headers_c, 
headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ +
[pull request][net-next 00/12] Mellanox, mlx5e updates 2018-08-10
Hi Dave, This series provides some updates to mlx5e netdevice driver. For more information please see tag log below. Please pull and let me know if there's any problem. Thanks, Saeed. --- The following changes since commit fd685657cd5441e504113db1928196d030c8c414: Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next (2018-08-10 10:33:08 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-updates-2018-08-10 for you to fetch changes up to 3a427a275c89d97607b8f801b5ca9bacaa0f336d: net/mlx5: Improve argument name for add flow API (2018-08-10 14:04:54 -0700) mlx5e-updates-2018-08-10 This series provides the following updates to mlx5e netdevice driver. 1) First 4 patches extend the support for ethtool rxnfc flow steering - Added ipv6 support - l4 proto ip field for both ip6 and ip4 2) Next 4 patches, reorganizing flow steering structures and declaration into one header file, and add two Kconfig flags to allow disabling/enabling mlx5 netdevice rx flow steering at compile time: CONFIG_MLX5_EN_ARFS for en_arfs.c CONFIG_MLX5_EN_RXNFC for en_fs_ethtool.c 3) More kconfig flags dependencies - vxlan.c depends on CONFIG_VXLAN - clock.c depends on CONFIG_PTP_1588_CLOCK 4) Reorganize the Makefile Thanks, Saeed. 
Eli Cohen (1): net/mlx5: Improve argument name for add flow API Moshe Shemesh (1): net/mlx5e: clock.c depends on CONFIG_PTP_1588_CLOCK Saeed Mahameed (10): net/mlx5e: Ethtool steering flow validation refactoring net/mlx5e: Ethtool steering flow parsing refactoring net/mlx5e: Ethtool steering, ip6 support net/mlx5e: Ethtool steering, l4 proto support net/mlx5e: Ethtool steering, move ethtool callbacks net/mlx5e: Add CONFIG_MLX5_EN_RXNFC for ethtool rx nfc net/mlx5e: Add CONFIG_MLX5_EN_ARFS for accelerated flow steering support net/mlx5e: Move flow steering declarations into en/fs.h net/mlx5e: vxlan.c depends on CONFIG_VXLAN net/mlx5: Reorganize the makefile drivers/net/ethernet/mellanox/mlx5/core/Kconfig| 19 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 60 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 203 +-- drivers/net/ethernet/mellanox/mlx5/core/en/fs.h| 210 +++ drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 4 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 57 +- .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c| 612 +++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 +- .../net/ethernet/mellanox/mlx5/core/lib/clock.h| 24 + .../net/ethernet/mellanox/mlx5/core/lib/vxlan.h| 12 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h| 1 - include/linux/mlx5/fs.h| 2 +- 14 files changed, 765 insertions(+), 466 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
[PATCH net-next v2] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
From: Vasundhara Volam This patch fixes following smatch warnings: drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for 'seg_hdr->signature' (5 vs 4) drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for 'record->signature' (5 vs 4) drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large for 'record->os_name' (65 vs 32) Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") Reported-by: Dan Carpenter Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index b6dbc3f..9c929cd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2823,7 +2823,7 @@ bnxt_fill_coredump_seg_hdr(struct bnxt *bp, int status, u32 duration, u32 instance) { memset(seg_hdr, 0, sizeof(*seg_hdr)); - strcpy(seg_hdr->signature, "sEgM"); + memcpy(seg_hdr->signature, "sEgM", 4); if (seg_rec) { seg_hdr->component_id = (__force __le32)seg_rec->component_id; seg_hdr->segment_id = (__force __le32)seg_rec->segment_id; @@ -2855,7 +2855,7 @@ bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record *record, time64_to_tm(start, 0, &tm); memset(record, 0, sizeof(*record)); - strcpy(record->signature, "cOrE"); + memcpy(record->signature, "cOrE", 4); record->flags = 0; record->low_version = 0; record->high_version = 1; @@ -2876,7 +2876,7 @@ bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record *record, record->os_ver_major = cpu_to_le32(os_ver_major); record->os_ver_minor = cpu_to_le32(os_ver_minor); - strcpy(record->os_name, utsname()->sysname); + 
strlcpy(record->os_name, utsname()->sysname, 32); time64_to_tm(end, 0, &tm); record->end_year = cpu_to_le16(tm.tm_year + 1900); record->end_month = cpu_to_le16(tm.tm_mon + 1); -- 2.5.1
Re: [PATCH net-next] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
On Fri, Aug 10, 2018 at 2:37 PM, David Miller wrote: > From: David Miller > Date: Fri, 10 Aug 2018 14:35:45 -0700 (PDT) > >> From: Michael Chan >> Date: Fri, 10 Aug 2018 17:02:12 -0400 >> >>> From: Vasundhara Volam >>> >>> This patch fixes following smatch warnings: >>> >>> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 >>> bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for >>> 'seg_hdr->signature' (5 vs 4) >>> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 >>> bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for >>> 'record->signature' (5 vs 4) >>> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 >>> bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large >>> for 'record->os_name' (65 vs 32) >>> >>> Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") >>> Reported-by: Dan Carpenter >>> Signed-off-by: Vasundhara Volam >>> Signed-off-by: Michael Chan >> >> Applied, thanks Michael. > > Actually, I'm reverting, this may fix those three warnings, but they are > replaced with > a new one: > > ./include/linux/string.h:246:9: warning: ‘__builtin_strncpy’ output may be > truncated copying 32 bytes from a string of length 64 [-Wstringop-truncation] > OK. I'm guessing strlcpy() is the right variant to use here. I will repost v2 using strlcpy(). Thanks.
Re: [PATCH net-next v6 10/11] net: sched: atomically check-allocate action
On Fri, Aug 10, 2018 at 3:29 AM Vlad Buslov wrote: > > Approach you suggest is valid, but has its own trade-offs: > > - As you noted, lock granularity becomes coarse-grained due to per-netns > scope. Sure, you acquire idrinfo->lock too, the only difference is how long you take it. The bottleneck of your approach is the same, also you take idrinfo->lock twice, so the contention is heavier. > > - I am not sure it is possible to call idr_replace() without obtaining > idrinfo->lock in this particular case. Concurrent delete of action with > same id is possible and, according to idr_replace() description, > unlocked execution is not supported for such use-case: But we can hold its refcnt before releasing idrinfo->lock, so idr_replace() can't race with concurrent delete. > > - High rate or replace request will generate a lot of unnecessary memory > allocations and deallocations. > Yes, this is literally how RCU works, always allocate and copy, release upon error. Also, if this is really a problem, we have SLAB_TYPESAFE_BY_RCU too. ;)
Re: [PATCH net-next] liquidio: copperhead LED identification
From: Felix Manlunas Date: Thu, 9 Aug 2018 13:54:12 -0700 > From: Raghu Vatsavayi > > Add LED identification support for liquidio TP copperhead cards. > > Signed-off-by: Raghu Vatsavayi > Acked-by: Derek Chickles > Signed-off-by: Felix Manlunas Applied.
Re: [PATCH net-next] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
From: David Miller Date: Fri, 10 Aug 2018 14:35:45 -0700 (PDT) > From: Michael Chan > Date: Fri, 10 Aug 2018 17:02:12 -0400 > >> From: Vasundhara Volam >> >> This patch fixes following smatch warnings: >> >> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 >> bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for >> 'seg_hdr->signature' (5 vs 4) >> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 >> bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for >> 'record->signature' (5 vs 4) >> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 >> bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large >> for 'record->os_name' (65 vs 32) >> >> Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") >> Reported-by: Dan Carpenter >> Signed-off-by: Vasundhara Volam >> Signed-off-by: Michael Chan > > Applied, thanks Michael. Actually, I'm reverting, this may fix those three warnings, but they are replaced with a new one: ./include/linux/string.h:246:9: warning: ‘__builtin_strncpy’ output may be truncated copying 32 bytes from a string of length 64 [-Wstringop-truncation] Sorry.
Re: [PATCH net-next] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
From: Michael Chan Date: Fri, 10 Aug 2018 17:02:12 -0400 > From: Vasundhara Volam > > This patch fixes following smatch warnings: > > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 > bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for > 'seg_hdr->signature' (5 vs 4) > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 > bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for > 'record->signature' (5 vs 4) > drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 > bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large > for 'record->os_name' (65 vs 32) > > Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") > Reported-by: Dan Carpenter > Signed-off-by: Vasundhara Volam > Signed-off-by: Michael Chan Applied, thanks Michael.
Re: [PATCH net-next 0/4] r8169: smaller improvements
From: Heiner Kallweit Date: Fri, 10 Aug 2018 22:36:03 +0200 > This series includes smaller improvements, no functional change > intended. Series applied, thank you.
Re: [PATCH net-next] r8169: remove version info
From: Heiner Kallweit Date: Fri, 10 Aug 2018 21:27:55 +0200 > The version number hasn't changed for ages and in general I doubt it > provides any benefit. The message in rtl_init_one() may even be > misleading because it's printed also if something fails in probe. > Therefore let's remove the version information. > > Signed-off-by: Heiner Kallweit Applied.
Re: pull request: bluetooth-next 2018-08-10
From: Johan Hedberg Date: Fri, 10 Aug 2018 22:57:07 +0300 > Here's one more (most likely last) bluetooth-next pull request for the > 4.19 kernel. > > - Added support for MediaTek serial Bluetooth devices > - Initial skeleton for controller-side address resolution support > - Fix BT_HCIUART_RTL related Kconfig dependencies > - A few other minor fixes/cleanups > > Please let me know if there are any issues pulling. Thanks. Pulled, thanks Johan.
[PATCH net-next] bnxt_en: Fix strcpy() warnings in bnxt_ethtool.c
From: Vasundhara Volam This patch fixes following smatch warnings: drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2826 bnxt_fill_coredump_seg_hdr() error: strcpy() '"sEgM"' too large for 'seg_hdr->signature' (5 vs 4) drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2858 bnxt_fill_coredump_record() error: strcpy() '"cOrE"' too large for 'record->signature' (5 vs 4) drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2879 bnxt_fill_coredump_record() error: strcpy() 'utsname()->sysname' too large for 'record->os_name' (65 vs 32) Fixes: 6c5657d085ae ("bnxt_en: Add support for ethtool get dump.") Reported-by: Dan Carpenter Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index b6dbc3f..d6f3289 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2823,7 +2823,7 @@ bnxt_fill_coredump_seg_hdr(struct bnxt *bp, int status, u32 duration, u32 instance) { memset(seg_hdr, 0, sizeof(*seg_hdr)); - strcpy(seg_hdr->signature, "sEgM"); + memcpy(seg_hdr->signature, "sEgM", 4); if (seg_rec) { seg_hdr->component_id = (__force __le32)seg_rec->component_id; seg_hdr->segment_id = (__force __le32)seg_rec->segment_id; @@ -2855,7 +2855,7 @@ bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record *record, time64_to_tm(start, 0, &tm); memset(record, 0, sizeof(*record)); - strcpy(record->signature, "cOrE"); + memcpy(record->signature, "cOrE", 4); record->flags = 0; record->low_version = 0; record->high_version = 1; @@ -2876,7 +2876,7 @@ bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record *record, record->os_ver_major = cpu_to_le32(os_ver_major); record->os_ver_minor = cpu_to_le32(os_ver_minor); - strcpy(record->os_name, utsname()->sysname); + 
strncpy(record->os_name, utsname()->sysname, 32); time64_to_tm(end, 0, &tm); record->end_year = cpu_to_le16(tm.tm_year + 1900); record->end_month = cpu_to_le16(tm.tm_mon + 1); -- 2.5.1
Re: Error running AF_XDP sample application
> On 2018-08-10 17:45, Björn Töpel wrote: > Thanks for taking AF_XDP for a spin! Thanks, I am actually an Igalia Coding Experience participant working remotely from Cameroon in Africa and I am working on making use of AF_XDP within Snabb. I am currently trying to familiarize myself with XDP and AF_XDP, I also hope to make contributions to the development/implementation of AF_XDP. > > Before I start digging into details; Do you have CONFIG_XDP_SOCKETS=y > in your config? :-) Yay! Setting CONFIG_XDP_SOCKETS=y in my config resolved every issue, and it now works fine. > > Björn Konrad
Re: [PATCH net-next] r8169: remove version info
On Fri, Aug 10, 2018 at 09:27:55PM +0200, Heiner Kallweit wrote: > The version number hasn't changed for ages and in general I doubt it > provides any benefit. The message in rtl_init_one() may even be > misleading because it's printed also if something fails in probe. > Therefore let's remove the version information. > > Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Andrew
[PATCH net-next 4/4] r8169: don't configure max jumbo frame size per chip version
We don't have to configure the max jumbo frame size per chip (sub-)version. It can be easily determined based on the chip family. And new members of the RTL8168 family (if there are any) should be automatically covered. Signed-off-by: Heiner Kallweit --- drivers/net/ethernet/realtek/r8169.c | 201 +++ 1 file changed, 83 insertions(+), 118 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index e5326420..344d77d9 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -147,123 +147,64 @@ enum mac_version { #define JUMBO_7K (7*1024 - ETH_HLEN - 2) #define JUMBO_9K (9*1024 - ETH_HLEN - 2) -#define _R(NAME, FW, SZ) { \ - .name = NAME, \ - .fw_name = FW, \ - .jumbo_max = SZ,\ -} - static const struct { const char *name; const char *fw_name; - u16 jumbo_max; } rtl_chip_infos[] = { /* PCI devices. */ - [RTL_GIGA_MAC_VER_01] = - _R("RTL8169", NULL, JUMBO_7K), - [RTL_GIGA_MAC_VER_02] = - _R("RTL8169s", NULL, JUMBO_7K), - [RTL_GIGA_MAC_VER_03] = - _R("RTL8110s", NULL, JUMBO_7K), - [RTL_GIGA_MAC_VER_04] = - _R("RTL8169sb/8110sb", NULL, JUMBO_7K), - [RTL_GIGA_MAC_VER_05] = - _R("RTL8169sc/8110sc", NULL, JUMBO_7K), - [RTL_GIGA_MAC_VER_06] = - _R("RTL8169sc/8110sc", NULL, JUMBO_7K), + [RTL_GIGA_MAC_VER_01] = {"RTL8169" }, + [RTL_GIGA_MAC_VER_02] = {"RTL8169s" }, + [RTL_GIGA_MAC_VER_03] = {"RTL8110s" }, + [RTL_GIGA_MAC_VER_04] = {"RTL8169sb/8110sb" }, + [RTL_GIGA_MAC_VER_05] = {"RTL8169sc/8110sc" }, + [RTL_GIGA_MAC_VER_06] = {"RTL8169sc/8110sc" }, /* PCI-E devices. 
*/ - [RTL_GIGA_MAC_VER_07] = - _R("RTL8102e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_08] = - _R("RTL8102e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_09] = - _R("RTL8102e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_10] = - _R("RTL8101e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_11] = - _R("RTL8168b/8111b",NULL, JUMBO_4K), - [RTL_GIGA_MAC_VER_12] = - _R("RTL8168b/8111b",NULL, JUMBO_4K), - [RTL_GIGA_MAC_VER_13] = - _R("RTL8101e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_14] = - _R("RTL8100e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_15] = - _R("RTL8100e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_16] = - _R("RTL8101e", NULL, JUMBO_1K), - [RTL_GIGA_MAC_VER_17] = - _R("RTL8168b/8111b",NULL, JUMBO_4K), - [RTL_GIGA_MAC_VER_18] = - _R("RTL8168cp/8111cp", NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_19] = - _R("RTL8168c/8111c",NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_20] = - _R("RTL8168c/8111c",NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_21] = - _R("RTL8168c/8111c",NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_22] = - _R("RTL8168c/8111c",NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_23] = - _R("RTL8168cp/8111cp", NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_24] = - _R("RTL8168cp/8111cp", NULL, JUMBO_6K), - [RTL_GIGA_MAC_VER_25] = - _R("RTL8168d/8111d",FIRMWARE_8168D_1, JUMBO_9K), - [RTL_GIGA_MAC_VER_26] = - _R("RTL8168d/8111d",FIRMWARE_8168D_2, JUMBO_9K), - [RTL_GIGA_MAC_VER_27] = - _R("RTL8168dp/8111dp", NULL, JUMBO_9K), - [RTL_GIGA_MAC_VER_28] = - _R("RTL8168dp/8111dp", NULL, JUMBO_9K), - [RTL_GIGA_MAC_VER_29] = - _R("RTL8105e", FIRMWARE_8105E_1, JUMBO_1K), - [RTL_GIGA_MAC_VER_30] = - _R("RTL8105e", FIRMWARE_8105E_1, JUMBO_1K), - [RTL_GIGA_MAC_VER_31] = - _R("RTL8168dp/8111dp", NULL, JUMBO_9K), - [RTL_GIGA_MAC_VER_32] = - _R("RTL8168e/8111e",FIRMWARE_8168E_1, JUMBO_9K), - [RTL_GIGA_MAC_VER_33] = - _R("RTL8168e/8111e",FIRMWARE_8168E_2, JUMBO_9K), - [RTL_GIGA_MAC_VER_34] = - _R("RTL8168evl/8111evl", FIRMWARE_8168E_3, JUMBO_9K), - [RTL_GIGA_MAC_VER_35] = - _R("RTL8168f/8111f",FIRMWARE_8168F_1, JUMBO_9K), - [RTL_GIGA_MAC_VER_36] = - 
_R("RTL8168f/8111f",FIRMWARE_8168F_2, JUMBO_9K), - [RTL_GIGA_MAC_VER_37] = - _R("RTL8402", FIRMWARE_8402_1, JUMBO_1K), - [RTL_GIGA_MAC_VER_38] = - _R("RTL8411", FIRMWARE_84
[PATCH net-next 2/4] r8169: simplify interrupt handler
Simplify the interrupt handler a little and make it better readable. Signed-off-by: Heiner Kallweit --- drivers/net/ethernet/realtek/r8169.c | 19 +++ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 7f0975b6..fc6ae446 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6520,20 +6520,15 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) { struct rtl8169_private *tp = dev_instance; - int handled = 0; - u16 status; + u16 status = rtl_get_events(tp); - status = rtl_get_events(tp); - if (status && status != 0x) { - status &= RTL_EVENT_NAPI | tp->event_slow; - if (status) { - handled = 1; + if (status == 0x || !(status & (RTL_EVENT_NAPI | tp->event_slow))) + return IRQ_NONE; - rtl_irq_disable(tp); - napi_schedule_irqoff(&tp->napi); - } - } - return IRQ_RETVAL(handled); + rtl_irq_disable(tp); + napi_schedule_irqoff(&tp->napi); + + return IRQ_HANDLED; } /* -- 2.18.0
[PATCH net-next 3/4] r8169: don't configure csum function per chip version
We don't have to configure the csum function per chip (sub-)version. The distinction is simple, versions RTL8102e and from RTL8168c onwards support csum_v2. Signed-off-by: Heiner Kallweit --- drivers/net/ethernet/realtek/r8169.c | 134 +-- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index fc6ae446..e5326420 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -141,134 +141,127 @@ enum mac_version { RTL_GIGA_MAC_NONE = 0xff, }; -enum rtl_tx_desc_version { - RTL_TD_0= 0, - RTL_TD_1= 1, -}; - #define JUMBO_1K ETH_DATA_LEN #define JUMBO_4K (4*1024 - ETH_HLEN - 2) #define JUMBO_6K (6*1024 - ETH_HLEN - 2) #define JUMBO_7K (7*1024 - ETH_HLEN - 2) #define JUMBO_9K (9*1024 - ETH_HLEN - 2) -#define _R(NAME,TD,FW,SZ) {\ +#define _R(NAME, FW, SZ) { \ .name = NAME, \ - .txd_version = TD, \ .fw_name = FW, \ .jumbo_max = SZ,\ } static const struct { const char *name; - enum rtl_tx_desc_version txd_version; const char *fw_name; u16 jumbo_max; } rtl_chip_infos[] = { /* PCI devices. */ [RTL_GIGA_MAC_VER_01] = - _R("RTL8169", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8169", NULL, JUMBO_7K), [RTL_GIGA_MAC_VER_02] = - _R("RTL8169s", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8169s", NULL, JUMBO_7K), [RTL_GIGA_MAC_VER_03] = - _R("RTL8110s", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8110s", NULL, JUMBO_7K), [RTL_GIGA_MAC_VER_04] = - _R("RTL8169sb/8110sb", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8169sb/8110sb", NULL, JUMBO_7K), [RTL_GIGA_MAC_VER_05] = - _R("RTL8169sc/8110sc", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8169sc/8110sc", NULL, JUMBO_7K), [RTL_GIGA_MAC_VER_06] = - _R("RTL8169sc/8110sc", RTL_TD_0, NULL, JUMBO_7K), + _R("RTL8169sc/8110sc", NULL, JUMBO_7K), /* PCI-E devices. 
*/ [RTL_GIGA_MAC_VER_07] = - _R("RTL8102e", RTL_TD_1, NULL, JUMBO_1K), + _R("RTL8102e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_08] = - _R("RTL8102e", RTL_TD_1, NULL, JUMBO_1K), + _R("RTL8102e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_09] = - _R("RTL8102e", RTL_TD_1, NULL, JUMBO_1K), + _R("RTL8102e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_10] = - _R("RTL8101e", RTL_TD_0, NULL, JUMBO_1K), + _R("RTL8101e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_11] = - _R("RTL8168b/8111b",RTL_TD_0, NULL, JUMBO_4K), + _R("RTL8168b/8111b",NULL, JUMBO_4K), [RTL_GIGA_MAC_VER_12] = - _R("RTL8168b/8111b",RTL_TD_0, NULL, JUMBO_4K), + _R("RTL8168b/8111b",NULL, JUMBO_4K), [RTL_GIGA_MAC_VER_13] = - _R("RTL8101e", RTL_TD_0, NULL, JUMBO_1K), + _R("RTL8101e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_14] = - _R("RTL8100e", RTL_TD_0, NULL, JUMBO_1K), + _R("RTL8100e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_15] = - _R("RTL8100e", RTL_TD_0, NULL, JUMBO_1K), + _R("RTL8100e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_16] = - _R("RTL8101e", RTL_TD_0, NULL, JUMBO_1K), + _R("RTL8101e", NULL, JUMBO_1K), [RTL_GIGA_MAC_VER_17] = - _R("RTL8168b/8111b",RTL_TD_0, NULL, JUMBO_4K), + _R("RTL8168b/8111b",NULL, JUMBO_4K), [RTL_GIGA_MAC_VER_18] = - _R("RTL8168cp/8111cp", RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168cp/8111cp", NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_19] = - _R("RTL8168c/8111c",RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168c/8111c",NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_20] = - _R("RTL8168c/8111c",RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168c/8111c",NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_21] = - _R("RTL8168c/8111c",RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168c/8111c",NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_22] = - _R("RTL8168c/8111c",RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168c/8111c",NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_23] = - _R("RTL8168cp/8111cp", RTL_TD_1, NULL, JUMBO_6K), + _R("RTL8168cp/8111cp", NULL, JUMBO_6K), [RTL_GIGA_MAC_VER_24] = -
[PATCH net-next 1/4] r8169: don't include asm headers directly
The asm headers shouldn't be included directly. asm/irq.h is implicitly included by linux/interrupt.h, and instead of asm/io.h include linux/io.h. Signed-off-by: Heiner Kallweit --- drivers/net/ethernet/realtek/r8169.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 77802a9d..7f0975b6 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -29,9 +30,6 @@ #include #include -#include -#include - #define MODULENAME "r8169" #define FIRMWARE_8168D_1 "rtl_nic/rtl8168d-1.fw" -- 2.18.0
[PATCH net-next 0/4] r8169: smaller improvements
This series includes smaller improvements, no functional change intended. Heiner Kallweit (4): r8169: don't include asm headers directly r8169: simplify interrupt handler r8169: don't configure csum function per chip version r8169: don't configure max jumbo frame size per chip version drivers/net/ethernet/realtek/r8169.c | 254 +++ 1 file changed, 106 insertions(+), 148 deletions(-) -- 2.18.0
pull request: bluetooth-next 2018-08-10
Hi Dave, Here's one more (most likely last) bluetooth-next pull request for the 4.19 kernel. - Added support for MediaTek serial Bluetooth devices - Initial skeleton for controller-side address resolution support - Fix BT_HCIUART_RTL related Kconfig dependencies - A few other minor fixes/cleanups Please let me know if there are any issues pulling. Thanks. Johan --- The following changes since commit 981467033a37d916649647fa3afe1fe99bba1817: tc-testing: remove duplicate spaces in skbedit match patterns (2018-08-05 17:39:24 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git for-upstream for you to fetch changes up to aa12af77aae05008b3e637b85944dcd512f75eba: Bluetooth: Add definitions for LE set address resolution (2018-08-10 16:57:57 +0200) Ankit Navik (1): Bluetooth: Add definitions for LE set address resolution Marcel Holtmann (2): Bluetooth: btqca: Introduce HCI_EV_VENDOR and use it Bluetooth: Introduce BT_HCIUART_RTL configuration option Sean Wang (3): dt-bindings: net: bluetooth: Add mediatek-bluetooth Bluetooth: mediatek: Add protocol support for MediaTek serial devices MAINTAINERS: add an entry for MediaTek Bluetooth driver YueHaibing (1): Bluetooth: remove redundant variables 'adv_set' and 'cp' .../devicetree/bindings/net/mediatek-bluetooth.txt | 35 ++ MAINTAINERS| 8 + drivers/bluetooth/Kconfig | 24 + drivers/bluetooth/Makefile | 1 + drivers/bluetooth/btmtkuart.c | 629 + drivers/bluetooth/btqca.c | 6 +- drivers/bluetooth/hci_h5.c | 4 + include/net/bluetooth/hci.h| 5 + net/bluetooth/hci_event.c | 31 +- net/bluetooth/mgmt.c | 3 - 10 files changed, 737 insertions(+), 9 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/mediatek-bluetooth.txt create mode 100644 drivers/bluetooth/btmtkuart.c signature.asc Description: PGP signature
[PATCH net-next] r8169: remove version info
The version number hasn't changed for ages and in general I doubt it provides any benefit. The message in rtl_init_one() may even be misleading because it's printed also if something fails in probe. Therefore let's remove the version information. Signed-off-by: Heiner Kallweit --- drivers/net/ethernet/realtek/r8169.c | 8 1 file changed, 8 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 8ea1fa36..77802a9d 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -32,7 +32,6 @@ #include #include -#define RTL8169_VERSION "2.3LK-NAPI" #define MODULENAME "r8169" #define FIRMWARE_8168D_1 "rtl_nic/rtl8168d-1.fw" @@ -784,7 +783,6 @@ MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot."); module_param_named(debug, debug.msg_enable, int, 0); MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)"); MODULE_LICENSE("GPL"); -MODULE_VERSION(RTL8169_VERSION); MODULE_FIRMWARE(FIRMWARE_8168D_1); MODULE_FIRMWARE(FIRMWARE_8168D_2); MODULE_FIRMWARE(FIRMWARE_8168E_1); @@ -1635,7 +1633,6 @@ static void rtl8169_get_drvinfo(struct net_device *dev, struct rtl_fw *rtl_fw = tp->rtl_fw; strlcpy(info->driver, MODULENAME, sizeof(info->driver)); - strlcpy(info->version, RTL8169_VERSION, sizeof(info->version)); strlcpy(info->bus_info, pci_name(tp->pci_dev), sizeof(info->bus_info)); BUILD_BUG_ON(sizeof(info->fw_version) < sizeof(rtl_fw->version)); if (!IS_ERR_OR_NULL(rtl_fw)) @@ -7292,11 +7289,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) int chipset, region, i; int rc; - if (netif_msg_drv(&debug)) { - printk(KERN_INFO "%s Gigabit Ethernet driver %s loaded\n", - MODULENAME, RTL8169_VERSION); - } - dev = devm_alloc_etherdev(&pdev->dev, sizeof (*tp)); if (!dev) return -ENOMEM; -- 2.18.0
[PATCH next-queue] ixgbe: don't clear ipsec sa counters on hw clearing
The software SA record counters should not be cleared when clearing the hardware tables. This causes the counters to be out of sync after a driver reset. Fixes: 63a67fe229ea ("ixgbe: add ipsec offload add and remove SA") Signed-off-by: Shannon Nelson --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 4 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index c116f45..df2f997 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -113,7 +113,6 @@ static void ixgbe_ipsec_set_rx_ip(struct ixgbe_hw *hw, u16 idx, __be32 addr[]) **/ static void ixgbe_ipsec_clear_hw_tables(struct ixgbe_adapter *adapter) { - struct ixgbe_ipsec *ipsec = adapter->ipsec; struct ixgbe_hw *hw = &adapter->hw; u32 buf[4] = {0, 0, 0, 0}; u16 idx; @@ -132,9 +131,6 @@ static void ixgbe_ipsec_clear_hw_tables(struct ixgbe_adapter *adapter) ixgbe_ipsec_set_tx_sa(hw, idx, buf, 0); ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0); } - - ipsec->num_rx_sa = 0; - ipsec->num_tx_sa = 0; } /** -- 2.7.4
Re: [PATCH bpf-next] BPF: helpers: New helper to obtain namespace data from current task
Jesper, I'm sorry, thanks a lot for the pointers to the current process. I just submitted v2 of this patch. Thanks again for checking this out. Bests On Fri, Aug 10, 2018 at 12:40:29PM +0200, Jesper Dangaard Brouer wrote: > On Thu, 9 Aug 2018 12:07:00 -0400 > Carlos Neira wrote: > > > Jesper, > > Here is the updated patch. > > > > From 92633f6819423093932e8d04aa3dc99a5913f6fd Mon Sep 17 00:00:00 2001 > > From: Carlos Neira > > Date: Thu, 9 Aug 2018 09:55:32 -0400 > > Subject: [PATCH bpf-next] BPF: helpers: New helper to obtain namespace > > data from current task > > > [...] > > Hi Carlos, > > This is not how you resubmit a patch, it is both documented in [1] and > [2], that: "In case the patch or patch series has to be reworked and > sent out again in a second or later revision, it is also required to > add a version number (v2, v3, ...) into the subject prefix" > > [1] > https://github.com/torvalds/linux/blob/master/Documentation/bpf/bpf_devel_QA.rst#q-how-do-i-indicate-which-tree-bpf-vs-bpf-next-my-patch-should-be-applied-to > > [2] > https://www.kernel.org/doc/html/latest/process/submitting-patches.html#the-canonical-patch-format > > Take a look at [1], which toplevel doc is about "HOWTO interact with > BPF subsystem". > > -- > Best regards, > Jesper Dangaard Brouer > MSc.CS, Principal Kernel Engineer at Red Hat > LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH bpf-next] bpf: enable btf for use in all maps
On 08/10/2018 06:40 PM, Alexei Starovoitov wrote: > On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote: >> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote: >>> On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote: On 08/09/2018 11:44 PM, Alexei Starovoitov wrote: > On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote: >> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote: >>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote: Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") enabled support for BTF and dumping via BPF fs for arraymap. However, both can be decoupled from each other such that all BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The check in array_map_check_btf() can be generalized as ultimately the key and value size is the only constraint that needs to match for the map. The fact that the key needs to be of type int is optional; it could be any data type as long as it matches the 4 byte key size, just like hash table key or others could be of any data type as well. 
Minimal example of a hash table dump which then works out of the box for bpftool: # bpftool map dump id 19 [{ "key": { "": { "vip": 0, "vipv6": [] }, "port": 0, "family": 0, "proto": 0 }, "value": { "flags": 0, "vip_num": 0 } } ] Signed-off-by: Daniel Borkmann Cc: Yonghong Song --- include/linux/bpf.h | 4 +--- kernel/bpf/arraymap.c | 27 --- kernel/bpf/inode.c| 3 ++- kernel/bpf/syscall.c | 24 4 files changed, 23 insertions(+), 35 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d..eb76e8e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,8 +48,6 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); }; struct bpf_map { @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } extern const struct bpf_map_ops bpf_map_offload_ops; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030..67f0bdf 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - u32 int_data; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. 
- */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; >>> >>> I think most of these checks are still necessary for array type. >>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM >>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't >>> makes sense. >> >> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array, >> on 32 bit it may be allowed
[PATCH net v2] l2tp: use sk_dst_check() to avoid race on sk->sk_dst_cache
From: Wei Wang In l2tp code, if it is a L2TP_UDP_ENCAP tunnel, tunnel->sk points to a UDP socket. User could call sendmsg() on both this tunnel and the UDP socket itself concurrently. As l2tp_xmit_skb() holds socket lock and calls __sk_dst_check() to refresh sk->sk_dst_cache, while udpv6_sendmsg() is lockless and calls sk_dst_check() to refresh sk->sk_dst_cache, there could be a race and cause the dst cache to be freed multiple times. So we fix l2tp side code to always call sk_dst_check() to guarantee xchg() is called when refreshing sk->sk_dst_cache to avoid race conditions. Syzkaller reported stack trace: BUG: KASAN: use-after-free in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] BUG: KASAN: use-after-free in atomic_fetch_add_unless include/linux/atomic.h:575 [inline] BUG: KASAN: use-after-free in atomic_add_unless include/linux/atomic.h:597 [inline] BUG: KASAN: use-after-free in dst_hold_safe include/net/dst.h:308 [inline] BUG: KASAN: use-after-free in ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 Read of size 4 at addr 8801aea9a880 by task syz-executor129/4829 CPU: 0 PID: 4829 Comm: syz-executor129 Not tainted 4.18.0-rc7-next-20180802+ #30 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] atomic_fetch_add_unless include/linux/atomic.h:575 [inline] atomic_add_unless include/linux/atomic.h:597 [inline] dst_hold_safe include/net/dst.h:308 [inline] ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 rt6_get_pcpu_route 
net/ipv6/route.c:1249 [inline] ip6_pol_route+0x354/0xd20 net/ipv6/route.c:1922 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2098 fib6_rule_lookup+0x283/0x890 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2126 ip6_dst_lookup_tail+0x1278/0x1da0 net/ipv6/ip6_output.c:978 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 ip6_sk_dst_lookup_flow+0x5ed/0xc50 net/ipv6/ip6_output.c:1117 udpv6_sendmsg+0x2163/0x36b0 net/ipv6/udp.c:1354 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:622 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:632 ___sys_sendmsg+0x51d/0x930 net/socket.c:2115 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2210 __do_sys_sendmmsg net/socket.c:2239 [inline] __se_sys_sendmmsg net/socket.c:2236 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2236 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446a29 Code: e8 ac b8 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:7f4de5532db8 EFLAGS: 0246 ORIG_RAX: 0133 RAX: ffda RBX: 006dcc38 RCX: 00446a29 RDX: 00b8 RSI: 20001b00 RDI: 0003 RBP: 006dcc30 R08: 7f4de5533700 R09: R10: R11: 0246 R12: 006dcc3c R13: 7ffe2b830fdf R14: 7f4de55339c0 R15: 0001 Fixes: 71b1391a4128 ("l2tp: ensure sk->dst is still valid") Reported-by: syzbot+05f840f3b04f211ba...@syzkaller.appspotmail.com Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Cc: Guillaume Nault Cc: David Ahern Cc: Cong Wang --- v1->v2: Removed dst_clone() as Guillaume Nault suggested net/l2tp/l2tp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 40261cb68e83..8aaf8157da2b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1110,7 +1110,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int 
hdr_len /* Get routing info from the tunnel socket */ skb_dst_drop(skb); - skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); + skb_dst_set(skb, sk_dst_check(sk, 0)); inet = inet_sk(sk); fl = &inet->cork.fl; -- 2.18.0.597.ga71716f1ad-goog
Re: [PATCH net] l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()
From: Guillaume Nault Date: Fri, 10 Aug 2018 19:58:38 +0200 > As far as I can see, f664e37dcc52 ("l2tp: fix missing refcount drop in > pppol2tp_tunnel_ioctl()") is still in your -stable queue, but the two > patches it depends on haven't made their way to -stable. I'd suggest to > either drop this patch from your -stable queue, or to also queue up > ecd012e45ab5 ("l2tp: filter out non-PPP sessions in pppol2tp_tunnel_ioctl()") > and > f664e37dcc52 ("l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()"). Thanks Guillaume, I'll sort this out the next time I work on stable submissions.
Re: [PATCH net] l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()
On Sun, Aug 05, 2018 at 01:24:13PM +0200, Guillaume Nault wrote: > On Fri, Aug 03, 2018 at 12:42:22PM -0700, David Miller wrote: > > From: Guillaume Nault > > Date: Fri, 3 Aug 2018 17:00:11 +0200 > > > > > If 'session' is not NULL and is not a PPP pseudo-wire, then we fail to > > > drop the reference taken by l2tp_session_get(). > > > > > > Fixes: ecd012e45ab5 ("l2tp: filter out non-PPP sessions in > > > pppol2tp_tunnel_ioctl()") > > > Signed-off-by: Guillaume Nault > > > --- > > > Sorry for the stupid mistake. I guess I got blinded by the apparent > > > simplicity of the bug when I wrote the original patch. > > > > Applied, thanks. > > > > I'm pretty sure I backported the commit this fixes, so I'm queueing > > this up for -stable as well. > > > Well, I think it wasn't. I didn't receive any notification from the > stable team about it and I don't see it in Greg's stable queue nor > in any -stable tree. > > Also, we'd have to queue 90904ff5f958 ("l2tp: fix pseudo-wire type for > sessions created by pppol2tp_connect()") first, which is necessary for > properly identifying PPP sessions. > > To recapitulate, three patches are needed to fix the original bug: > > * 90904ff5f958 ("l2tp: fix pseudo-wire type for sessions created by > pppol2tp_connect()"): allows later patches to check if a session is > PPP. > > * ecd012e45ab5 ("l2tp: filter out non-PPP sessions in > pppol2tp_tunnel_ioctl()"): refuses calling pppol2tp_session_ioctl() > on non-PPP sessions. This fixes an invalid pointer dereference when > the session is Ethernet. Unfortunately it fails to drop the > reference it takes on the session. > > * f664e37dcc52 ("l2tp: fix missing refcount drop in > pppol2tp_tunnel_ioctl()"): fixes the memory leak introduced by the > previous patch. > Hi Dave, As far as I can see, f664e37dcc52 ("l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()") is still in your -stable queue, but the two patches it depends on haven't made their way to -stable. 
I'd suggest to either drop this patch from your -stable queue, or to also queue up ecd012e45ab5 ("l2tp: filter out non-PPP sessions in pppol2tp_tunnel_ioctl()") and f664e37dcc52 ("l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()").
[PATCH net-next v2 09/15] net: sched: act_skbmod: remove dependency on rtnl lock
Move read of skbmod_p rcu pointer to be protected by tcf spinlock. Use tcf spinlock to protect private skbmod data from concurrent modification during dump. Signed-off-by: Vlad Buslov --- net/sched/act_skbmod.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index c437c6d51a71..e9c86ade3b40 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -156,7 +156,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, d = to_skbmod(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { tcf_idr_release(*a, bind); @@ -166,10 +165,10 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p->flags = lflags; d->tcf_action = parm->action; - p_old = rtnl_dereference(d->skbmod_p); - if (ovr) spin_lock_bh(&d->tcf_lock); + /* Protected by tcf_lock if overwriting existing action. */ + p_old = rcu_dereference_protected(d->skbmod_p, 1); if (lflags & SKBMOD_F_DMAC) ether_addr_copy(p->eth_dst, daddr); @@ -205,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, { struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tcf_skbmod_params *p; struct tc_skbmod opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; + p = rcu_dereference_protected(d->skbmod_p, + lockdep_is_held(&d->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -231,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: + 
spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 07/15] net: sched: act_sample: remove dependency on rtnl lock
Use tcf spinlock to protect private sample action data from concurrent modification during dump and init. Signed-off-by: Vlad Buslov --- net/sched/act_sample.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 2608ccc83e5e..81071afe1b43 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -80,11 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, } s = to_sample(*a); + spin_lock(&s->tcf_lock); s->tcf_action = parm->action; s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { + spin_unlock(&s->tcf_lock); tcf_idr_release(*a, bind); return -ENOMEM; } @@ -94,6 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } + spin_unlock(&s->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -105,7 +108,8 @@ static void tcf_sample_cleanup(struct tc_action *a) struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rtnl_dereference(s->psample_group); + /* last reference to action, no need to lock */ + psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); @@ -174,12 +178,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, - .action = s->tcf_action, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt= atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock(&s->tcf_lock); + opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,9 +201,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, 
if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; + spin_unlock(&s->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 06/15] net: sched: act_pedit: remove dependency on rtnl lock
Rearrange pedit init code to only access pedit action data while holding tcf spinlock. Change keys allocation type to atomic to allow it to execute while holding tcf spinlock. Take tcf spinlock in dump function when accessing pedit action data. Signed-off-by: Vlad Buslov --- net/sched/act_pedit.c | 40 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 43ba999b2d23..3f62da72ab6a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -187,44 +187,38 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, tcf_idr_cleanup(tn, parm->index); goto out_free; } - p = to_pedit(*a); - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - tcf_idr_release(*a, bind); - ret = -ENOMEM; - goto out_free; - } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) goto out_free; if (!ovr) { - tcf_idr_release(*a, bind); ret = -EEXIST; - goto out_free; - } - p = to_pedit(*a); - if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - ret = -ENOMEM; - goto out_free; - } + goto out_release; } } else { return err; } + p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); - p->tcfp_flags = parm->flags; - p->tcf_action = parm->action; - if (keys) { + + if (ret == ACT_P_CREATED || + (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { + keys = kmalloc(ksize, GFP_ATOMIC); + if (!keys) { + spin_unlock_bh(&p->tcf_lock); + ret = -ENOMEM; + goto out_release; + } kfree(p->tcfp_keys); p->tcfp_keys = keys; p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; + kfree(p->tcfp_keys_ex); p->tcfp_keys_ex = keys_ex; @@ -232,6 +226,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; + +out_release: + tcf_idr_release(*a, bind); out_free: kfree(keys_ex); return ret; @@ -410,6 +407,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, 
struct tc_action *a, if (unlikely(!opt)) return -ENOBUFS; + spin_lock_bh(&p->tcf_lock); memcpy(opt->keys, p->tcfp_keys, p->tcfp_nkeys * sizeof(struct tc_pedit_key)); opt->index = p->tcf_index; @@ -432,11 +430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; -- 2.7.5
[PATCH net-next v2 10/15] net: sched: act_tunnel_key: remove dependency on rtnl lock
Use tcf lock to protect tunnel key action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl lock assertion that is no longer required. Signed-off-by: Vlad Buslov --- net/sched/act_tunnel_key.c | 26 +- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d42d9e112789..ba2ae9f75ef5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -204,7 +204,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; - struct tcf_tunnel_key_params *params_old; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; struct tc_tunnel_key *parm; @@ -346,24 +345,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, t = to_tunnel_key(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } - - params_old = rtnl_dereference(t->params); - - t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - rcu_assign_pointer(t->params, params_new); - - if (params_old) - kfree_rcu(params_old, rcu); + spin_lock(&t->tcf_lock); + t->tcf_action = parm->action; + rcu_swap_protected(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); + spin_unlock(&t->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -485,12 +482,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index= t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = 
atomic_read(&t->tcf_bindcnt) - bind, - .action = t->tcf_action, }; struct tcf_t tm; - params = rtnl_dereference(t->params); - + spin_lock(&t->tcf_lock); + params = rcu_dereference_protected(t->params, + lockdep_is_held(&t->tcf_lock)); + opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -522,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; + spin_unlock(&t->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 05/15] net: sched: act_ipt: remove dependency on rtnl lock
Use tcf spinlock to protect ipt action private data from concurrent modification during dump. Ipt init already takes tcf spinlock when modifying ipt state. Signed-off-by: Vlad Buslov --- net/sched/act_ipt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 0dc787a57798..e149f0e66cb6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -288,6 +288,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, * for foolproof you need to not assume this */ + spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; @@ -307,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: + spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; -- 2.7.5
[PATCH net-next v2 01/15] net: sched: act_bpf: remove dependency on rtnl lock
Use tcf spinlock to protect bpf action private data from concurrent modification during dump and init. Remove rtnl lock assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_bpf.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6203eb075c9a..9e8a33f9fee3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -143,11 +143,12 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, - .action = prog->tcf_action, }; struct tcf_t tm; int ret; + spin_lock(&prog->tcf_lock); + opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, TCA_ACT_BPF_PAD)) goto nla_put_failure; + spin_unlock(&prog->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } @@ -264,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either -* with rtnl lock or during final cleanup in rcu callback +* with tcf lock or during final cleanup in rcu callback */ cfg->filter = rcu_dereference_protected(prog->filter, 1); @@ -336,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, goto out; prog = to_bpf(*act); - ASSERT_RTNL(); + spin_lock(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -349,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); + spin_unlock(&prog->tcf_lock); if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); -- 2.7.5
[PATCH net-next v2 11/15] net: sched: act_vlan: remove dependency on rtnl lock
Use tcf spinlock to protect vlan action private data from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_vlan.c | 27 +++ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 15a0ee214c9c..5bde17fe3608 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; - struct tcf_vlan_params *p, *p_old; + struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -202,26 +202,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { tcf_idr_release(*a, bind); return -ENOMEM; } - v->tcf_action = parm->action; - - p_old = rtnl_dereference(v->vlan_p); - p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; - rcu_assign_pointer(v->vlan_p, p); + spin_lock(&v->tcf_lock); + v->tcf_action = parm->action; + rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + spin_unlock(&v->tcf_lock); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -243,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); - struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); + struct tcf_vlan_params *p; struct tc_vlan opt = { .index= v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, - 
.action = v->tcf_action, - .v_action = p->tcfv_action, }; struct tcf_t t; + spin_lock(&v->tcf_lock); + opt.action = v->tcf_action; + p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -268,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; + spin_unlock(&v->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 12/15] net: sched: extend action ops with put_dev callback
As a preparation for removing dependency on rtnl lock from rules update path, all users of shared objects must take reference while working with them. Extend action ops with put_dev() API to be used on net device returned by get_dev(). Modify mirred action (only action that implements get_dev callback): - Take reference to net device in get_dev. - Implement put_dev API that releases reference to net device. Signed-off-by: Vlad Buslov --- include/net/act_api.h | 1 + net/sched/act_mirred.c | 12 +++- net/sched/cls_api.c| 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c9bc02d05e1..1ad5b19e83a9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void(*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + void(*put_dev)(struct net_device *dev); int (*delete)(struct net *net, u32 index); }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b26d060da08e..7a045cc7fe3b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -358,8 +358,17 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); + + if (dev) + dev_hold(dev); - return rtnl_dereference(m->tcfm_dev); + return dev; +} + +static void tcf_mirred_put_dev(struct net_device *dev) +{ + dev_put(dev); } static int tcf_mirred_delete(struct net *net, u32 index) @@ -382,6 +391,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev= tcf_mirred_get_dev, + .put_dev= tcf_mirred_put_dev, .delete = tcf_mirred_delete, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e8b0bbd0883f..bdabf8144e21 100644 --- 
a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2167,6 +2167,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, if (!dev) continue; ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + a->ops->put_dev(dev); if (ret < 0) return ret; ok_count += ret; -- 2.7.5
[PATCH net-next v2 08/15] net: sched: act_simple: remove dependency on rtnl lock
Use tcf spinlock to protect private simple action data from concurrent modification during dump. (simple init already uses tcf spinlock when changing action state) Signed-off-by: Vlad Buslov --- net/sched/act_simple.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index aa51152e0066..18e4452574cd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -156,10 +156,11 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; @@ -167,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 15/15] net: sched: act_police: remove dependency on rtnl lock
Use tcf spinlock to protect police action private data from concurrent modification during dump. (init already uses tcf spinlock when changing police action state) Pass tcf spinlock as estimator lock argument to gen_replace_estimator() during action init. Signed-off-by: Vlad Buslov --- net/sched/act_police.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1f3192ea8df7..88c16d80c1cf 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,14 +274,15 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, - .action = police->tcf_action, - .mtu = police->tcfp_mtu, - .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&police->tcf_lock); + opt.action = police->tcf_action; + opt.mtu = police->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); if (police->rate_present) psched_ratecfg_getrate(&opt.rate, &police->rate); if (police->peak_present) @@ -301,10 +302,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; + spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 14/15] net: core: protect rate estimator statistics pointer with lock
Extend gen_new_estimator() to also take stats_lock when re-assigning rate estimator statistics pointer. (to be used by unlocked actions) Rename 'stats_lock' to 'lock' and change argument description to explain that it is now also used for control path. Signed-off-by: Vlad Buslov --- include/net/gen_stats.h | 4 ++-- net/core/gen_estimator.c | 21 + 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 0304ba2ae353..883bb9085f15 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -59,13 +59,13 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 98fd12721221..e4e442d70c2d 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -112,7 +112,7 @@ static void est_timer(struct timer_list *t) * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * @@ -128,7 +128,7 @@ static void est_timer(struct timer_list *t) int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct 
net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { @@ -154,19 +154,22 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, seqcount_init(&est->seq); intvl_log = parm->interval + 2; est->bstats = bstats; - est->stats_lock = stats_lock; + est->stats_lock = lock; est->running = running; est->ewma_log = parm->ewma_log; est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; - if (stats_lock) + if (lock) local_bh_disable(); est_fetch_counters(est, &b); - if (stats_lock) + if (lock) local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; + + if (lock) + spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { del_timer_sync(&old->timer); @@ -179,6 +182,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, mod_timer(&est->timer, est->next_jiffies); rcu_assign_pointer(*rate_est, est); + if (lock) + spin_unlock_bh(lock); if (old) kfree_rcu(old, rcu); return 0; @@ -209,7 +214,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * @@ -221,11 +226,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, -stats_lock, running, opt); +lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); -- 2.7.5
[PATCH net-next v2 04/15] net: sched: act_ife: remove dependency on rtnl lock
Use tcf spinlock and rcu to protect params pointer from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Ife action has meta-actions that are compiled as standalone modules. Rtnl mutex must be released while loading a kernel module. In order to support execution without rtnl mutex, propagate 'rtnl_held' argument to meta action loading functions. When requesting meta action module, conditionally release rtnl lock depending on 'rtnl_held' argument. Signed-off-by: Vlad Buslov --- net/sched/act_ife.c | 40 +--- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index df4060e32d43..5d200495e467 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -268,7 +268,8 @@ static const char *ife_meta_id2name(u32 metaid) * under ife->tcf_lock for existing action */ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, - void *val, int len, bool exists) + void *val, int len, bool exists, + bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; @@ -278,9 +279,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, #ifdef CONFIG_MODULES if (exists) spin_unlock_bh(&ife->tcf_lock); - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); ops = find_ife_oplist(metaid); @@ -421,7 +424,7 @@ static void tcf_ife_cleanup(struct tc_action *a) /* under ife->tcf_lock for existing action */ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, -bool exists) +bool exists, bool rtnl_held) { int len = 0; int rc = 0; @@ -433,7 +436,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = 
nla_len(tb[i]); - rc = load_metaops_and_vet(ife, i, val, len, exists); + rc = load_metaops_and_vet(ife, i, val, len, exists, + rtnl_held); if (rc != 0) return rc; @@ -454,7 +458,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; - struct tcf_ife_params *p, *p_old; + struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -558,7 +562,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return err; } - err = populate_metalist(ife, tb2, exists); + err = populate_metalist(ife, tb2, exists, rtnl_held); if (err) goto metadata_parse_err; @@ -581,13 +585,13 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife->tcf_action = parm->action; + /* protected by tcf_lock when modifying existing action */ + rcu_swap_protected(ife->params, p, 1); + if (exists) spin_unlock_bh(&ife->tcf_lock); - - p_old = rtnl_dereference(ife->params); - rcu_assign_pointer(ife->params, p); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -600,16 +604,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); - struct tcf_ife_params *p = rtnl_dereference(ife->params); + struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - .action = ife->tcf_action, - .flags = p->flags, }; struct tcf_t t; + spin_lock_bh(&ife->tcf_lock); + opt.action = ife->tcf_action; + p = rcu_dereference_protected(ife->params, + lockdep_is_held(&ife->tcf_lock)); + opt.flags = p->flags; +
[PATCH net-next v2 03/15] net: sched: act_gact: remove dependency on rtnl lock
Use tcf spinlock to protect gact action private state from concurrent modification during dump and init. Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_gact.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 661b72b9147d..bfccd34a3968 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(*a); - ASSERT_RTNL(); + spin_lock(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -126,6 +126,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif + spin_unlock(&gact->tcf_lock); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; @@ -178,10 +180,11 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, - .action = gact->tcf_action, }; struct tcf_t t; + spin_lock(&gact->tcf_lock); + opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; #ifdef CONFIG_GACT_PROB @@ -199,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; + spin_unlock(&gact->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 13/15] net: sched: act_mirred: remove dependency on rtnl lock
Re-introduce mirred list spinlock, that was removed some time ago, in order to protect it from concurrent modifications, instead of relying on rtnl lock. Use tcf spinlock to protect mirred action private data from concurrent modification in init and dump. Rearrange access to mirred data in order to be performed only while holding the lock. Rearrange net dev access to always hold reference while working with it, instead of relying on rntl lock. Signed-off-by: Vlad Buslov --- net/sched/act_mirred.c | 78 +- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7a045cc7fe3b..327be257033d 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,6 +30,7 @@ #include static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -62,13 +63,23 @@ static bool tcf_mirred_can_reinsert(int action) return false; } +static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) +{ + return rcu_dereference_protected(m->tcfm_dev, +lockdep_is_held(&m->tcf_lock)); +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; + spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rtnl_dereference(m->tcfm_dev); + spin_unlock(&mirred_list_lock); + + /* last reference to action, no need to lock */ + dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); } @@ -128,22 +139,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } - if (parm->ifindex) { - dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) { - if (exists) - tcf_idr_release(*a, bind); - else - tcf_idr_cleanup(tn, parm->index); - return -ENODEV; - } - mac_header_xmit = dev_is_mac_header_xmit(dev); - } else { - dev = NULL; - } if (!exists) { - if (!dev) { + if (!parm->ifindex) { 
tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; @@ -161,19 +159,31 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(*a); - ASSERT_RTNL(); + spin_lock(&m->tcf_lock); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; - if (dev != NULL) { - if (ret != ACT_P_CREATED) - dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); - dev_hold(dev); - rcu_assign_pointer(m->tcfm_dev, dev); + + if (parm->ifindex) { + dev = dev_get_by_index(net, parm->ifindex); + if (!dev) { + spin_unlock(&m->tcf_lock); + tcf_idr_release(*a, bind); + return -ENODEV; + } + mac_header_xmit = dev_is_mac_header_xmit(dev); + rcu_swap_protected(m->tcfm_dev, dev, + lockdep_is_held(&m->tcf_lock)); + if (dev) + dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } + spin_unlock(&m->tcf_lock); if (ret == ACT_P_CREATED) { + spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + spin_unlock(&mirred_list_lock); + tcf_idr_insert(tn, *a); } @@ -287,26 +297,33 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rtnl_dereference(m->tcfm_dev); struct tc_mirred opt = { .index = m->tcf_index, - .action = m->tcf_action, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, - .eaction = m->tcfm_eaction, - .ifindex = dev ? dev->ifindex : 0, }; + struct net_device *dev; struct tcf_t t; + spin_lock(&m->tcf_lock); + opt.action = m->tcf_action; + opt.eaction = m->tcfm_eaction; + dev = tcf_mirred_dev_dereference(m); + if (dev) + opt.ifindex = dev->ifindex; + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto n
[PATCH net-next v2 02/15] net: sched: act_csum: remove dependency on rtnl lock
Use tcf lock to protect csum action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_csum.c | 24 +++- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 648a3a35b720..f01c59ba6d12 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -50,7 +50,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); - struct tcf_csum_params *params_old, *params_new; + struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -88,20 +88,22 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } p = to_tcf_csum(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); return -ENOMEM; } - params_old = rtnl_dereference(p->params); + params_new->update_flags = parm->update_flags; + spin_lock(&p->tcf_lock); p->tcf_action = parm->action; - params_new->update_flags = parm->update_flags; - rcu_assign_pointer(p->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); + spin_unlock(&p->tcf_lock); + + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -599,11 +601,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, - .action = p->tcf_action, }; struct tcf_t t; - params = rtnl_dereference(p->params); + 
spin_lock(&p->tcf_lock); + params = rcu_dereference_protected(p->params, + lockdep_is_held(&p->tcf_lock)); + opt.action = p->tcf_action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; + spin_unlock(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next v2 00/15] Remove rtnl lock dependency from all action implementations
Currently, all netlink protocol handlers for updating rules, actions and qdiscs are protected with single global rtnl lock which removes any possibility for parallelism. This patch set is a second step to remove rtnl lock dependency from TC rules update path. Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added. Handlers registered with this flag are called without RTNL taken. End goal is to have rule update handlers(RTM_NEWTFILTER, RTM_DELTFILTER, etc.) to be registered with UNLOCKED flag to allow parallel execution. However, there is no intention to completely remove or split rtnl lock itself. This patch set addresses specific problems in implementation of tc actions that prevent their control path from being executed concurrently. Additional changes are required to refactor classifiers API and individual classifiers for parallel execution. This patch set lays groundwork to eventually register rule update handlers as rtnl-unlocked. Action API is already prepared for parallel execution with previous patch set, which means that action ops that use action API for their implementation do not require additional modifications. (delete, search, etc.) Action API implements concurrency-safe reference counting and guarantees that cleanup/delete is called only once, after last reference to action is released. The goal of this change is to update specific actions APIs that access action private state directly, in order to be independent from external locking. General approach is to re-use existing tcf_lock spinlock (used by some action implementation to synchronize control path with data path) to protect action private state from concurrent modification. If action has rcu-protected pointer, tcf spinlock is used to protect its update code, instead of relying on rtnl lock. Some actions need to determine rtnl mutex status in order to release it. 
For example, ife action can load additional kernel modules(meta ops) and must make sure that no locks are held during module load. In such cases 'rtnl_held' argument is used to conditionally release rtnl mutex. Changes from V1 to V2: - Patch 12: - new patch - Patch 14: - refactor gen_new_estimator() to reuse stats_lock when re-assigning rate estimator statistics pointer - Remove mirred and tunnel_key helper function changes. (to be submitted and standalone patch) Vlad Buslov (15): net: sched: act_bpf: remove dependency on rtnl lock net: sched: act_csum: remove dependency on rtnl lock net: sched: act_gact: remove dependency on rtnl lock net: sched: act_ife: remove dependency on rtnl lock net: sched: act_ipt: remove dependency on rtnl lock net: sched: act_pedit: remove dependency on rtnl lock net: sched: act_sample: remove dependency on rtnl lock net: sched: act_simple: remove dependency on rtnl lock net: sched: act_skbmod: remove dependency on rtnl lock net: sched: act_tunnel_key: remove dependency on rtnl lock net: sched: act_vlan: remove dependency on rtnl lock net: sched: extend action ops with put_dev callback net: sched: act_mirred: remove dependency on rtnl lock net: core: protect rate estimator statistics pointer with lock net: sched: act_police: remove dependency on rtnl lock include/net/act_api.h | 1 + include/net/gen_stats.h| 4 +-- net/core/gen_estimator.c | 21 ++- net/sched/act_bpf.c| 10 -- net/sched/act_csum.c | 24 - net/sched/act_gact.c | 10 -- net/sched/act_ife.c| 40 + net/sched/act_ipt.c| 3 ++ net/sched/act_mirred.c | 88 -- net/sched/act_pedit.c | 40 ++--- net/sched/act_police.c | 9 +++-- net/sched/act_sample.c | 12 +-- net/sched/act_simple.c | 6 +++- net/sched/act_skbmod.c | 14 +--- net/sched/act_tunnel_key.c | 26 +++--- net/sched/act_vlan.c | 27 +++--- net/sched/cls_api.c| 1 + 17 files changed, 214 insertions(+), 122 deletions(-) -- 2.7.5
Re: [PATCH][net-next] packet: switch kvzalloc to allocate memory
From: Li RongQing Date: Fri, 10 Aug 2018 18:00:00 +0800 > @@ -4275,7 +4259,7 @@ static int packet_set_ring(struct sock *sk, union > tpacket_req_u *req_u, > > err = -ENOMEM; > order = get_order(req->tp_block_size); > - pg_vec = alloc_pg_vec(req, order); > + pg_vec = alloc_pg_vec(req); > if (unlikely(!pg_vec)) > goto out; > switch (po->tp_version) { Variable 'order' is now unused.
Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()
On Fri, Aug 10, 2018 at 08:03:51PM +0300, Konstantin Khorenko wrote: > On 08/09/2018 11:43 AM, Konstantin Khorenko wrote: > > On 08/04/2018 02:36 AM, Marcelo Ricardo Leitner wrote: > > > On Fri, Aug 03, 2018 at 07:21:00PM +0300, Konstantin Khorenko wrote: > > > ... > > > > Performance results: > > > > > > > > * Kernel: v4.18-rc6 - stock and with 2 patches from Oleg (earlier in > > > > this thread) > > > > * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz > > > > RAM: 32 Gb > > > > > > > > * netperf: taken from https://github.com/HewlettPackard/netperf.git, > > > > compiled from sources with sctp support > > > > * netperf server and client are run on the same node > > > > * ip link set lo mtu 1500 > > > > > > > > The script used to run tests: > > > > # cat run_tests.sh > > > > #!/bin/bash > > > > > > > > for test in SCTP_STREAM SCTP_STREAM_MANY SCTP_RR SCTP_RR_MANY; do > > > > echo "TEST: $test"; > > > > for i in `seq 1 3`; do > > > > echo "Iteration: $i"; > > > > set -x > > > > netperf -t $test -H localhost -p 2 -S 20,20 -s > > > > 20,20 \ > > > > -l 60 -- -m 1452; > > > > set +x > > > > done > > > > done > > > > > > > > > > > > Results (a bit reformatted to be more readable): > > > ... > > > > > > Nice, good numbers. > > > > > > I'm missing some test that actually uses more than 1 stream. All tests > > > in netperf uses only 1 stream. They can use 1 or Many associations on > > > a socket, but not multiple streams. That means the numbers here show > > > that we shouldn't see any regression on the more traditional uses, per > > > Michael's reply on the other email, but it is not testing how it will > > > behave if we go crazy and use the 64k streams (worst case). > > > > > > You'll need some other tool to test it. One idea is sctp_test, from > > > lksctp-tools. 
Something like: > > > > > > Server side: > > > ./sctp_test -H 172.0.0.1 -P 2 -l -d 0 > > > Client side: > > > time ./sctp_test -H 172.0.0.1 -P 1 \ > > > -h 172.0.0.1 -p 2 -s \ > > > -c 1 -M 65535 -T -t 1 -x 10 -d 0 > > > > > > And then measure the difference on how long each test took. Can you > > > get these too? > > > > > > Interesting that in my laptop just to start this test for the first > > > time can took some *seconds*. Seems kernel had a hard time > > > defragmenting the memory here. :) > > Hi Marcelo, > > got 3 of 4 results, please take a look, but i failed to measure running > the test on stock kernel when memory is fragmented, test fails with > *** connect: Cannot allocate memory *** Hah, okay. > > > Performance results: > > * Kernel: v4.18-rc8 - stock and with 2 patches v3 > * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz > RAM: 32 Gb > > * sctp_test: https://github.com/sctp/lksctp-tools > * both server and client are run on the same node > * ip link set lo mtu 1500 > * sysctl -w vm.max_map_count=6553 (need it to make memory fragmented) > > The script used to run tests: > = > # cat run_sctp_test.sh > #!/bin/bash > > set -x > > uname -r > ip link set lo mtu 1500 > swapoff -a > > free > cat /proc/buddyinfo > > ./src/apps/sctp_test -H 127.0.0.1 -P 2 -l -d 0 & > sleep 3 > > time ./src/apps/sctp_test -H 127.0.0.1 -P 1 -h 127.0.0.1 -p 2 \ > -s -c 1 -M 65535 -T -t 1 -x 10 -d 0 1>/dev/null > > killall -9 lt-sctp_test > === > > Results (a bit reformatted to be more readable): > > 1) ms stock kernel v4.18-rc8, no memory fragmentation > Info about memory - more or less same to iterations: > # free > totalusedfree shared buff/cache > available > Mem: 32906008 21315632178184 764 514668 > 32260968 > Swap: 0 0 0 > > cat /proc/buddyinfo > Node 0, zone DMA 0 1 1 0 2 1 1 0 > 1 1 3 > Node 0, zoneDMA32 1 3 5 4 2 2 3 6 > 6 4867 > Node 0, zone Normal551422160204193 34 15 7 > 22 19 6956 > > test 1 test 2 test 3 > real0m14.715s 0m14.593s 0m15.954s > user0m0.954s 
0m0.955s0m0.854s > sys 0m13.388s 0m12.537s 0m13.749s > > 2) kernel with fixes, no memory fragmentation > 'free' and 'buddyinfo' similar to 1) > > test 1 test 2 test 3 > real0m14.959s 0m14.693s 0m14.762s > user0m0.948s 0m0.921s0m0.929s > sys 0m13.538s 0m13.225s 0m13.217s > > 3) kernel with fixes, memory fragmented > (mmap() all available RAM, touch all pages, munmap() half of pages (each > second page), do it again for
Re: [Query]: DSA Understanding
On 08/10/2018 04:26 AM, Lad, Prabhakar wrote: > Hi Andrew, > > On Thu, Aug 9, 2018 at 6:23 PM Andrew Lunn wrote: >> >>> Its coming from the switch lan4 I have attached the png, where >>> C4:F3:12:08:FE:7F is >>> the mac of lan4, which is broadcast to ff:ff:ff:ff:ff:ff, which is >>> causing rx counter on >>> PC to go up. >> >> So, big packets are making it from the switch to the PC. But the small >> ARP packets are not. >> >> This is what Florian was suggesting. >> >> ARP packets are smaller than 64 bytes, which is the minimum packet >> size for Ethernet. Any packets smaller than 64 bytes are called runt >> packets. They have to be padded upto 64 bytes in order to make them >> valid. Otherwise the destination, or any switch along the path, might >> throw them away. >> >> What could be happening is that the CSPW driver or hardware is padding >> the packet to 64 bytes. But that packet has a DSA header in it. The >> switch removes the header, recalculate the checksum and sends the >> packet. It is now either 4 or 8 bytes smaller, depending on what DSA >> header was used. It then becomes a runt packet. >> > Thank you for the clarification, this really helped me out. > >> Florian had to fix this problem recently. >> >> http://patchwork.ozlabs.org/patch/836534/ >> > But seems like this patch was never accepted, instead > brcm_tag_xmit_ll() does it if I am understanding it correctly. > similarly to this ksz_xmit() is taking care of padding. net/dsa/tag_brcm.c ended up doing the padding because that was a more generic and central location: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/tree/net/dsa/tag_brcm.c#n73 > >> You probably need something similar for the cpsw. >> > looking at the tag_ksz.c in xmit function this is taken care of I agree, this should be padding packets correctly, can you still instrument cpsw to make sure that what comes to its ndo_start_xmit() is ETH_ZLEN + tag_len or more? 
> > /* For Ingress (Host -> KSZ), 2 bytes are added before FCS. > * --- > * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) > * --- > * tag0 : Prioritization (not used now) > * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) > * > * For Egress (KSZ -> Host), 1 byte is added before FCS. > * --- > * DA(6bytes)|SA(6bytes)||Data(nbytes)|tag0(1byte)|FCS(4bytes) > * --- > * tag0 : zero-based value represents port > * (eg, 0x00=port1, 0x02=port3, 0x06=port7) > */ > > #defineKSZ_INGRESS_TAG_LEN2 > #defineKSZ_EGRESS_TAG_LEN1 > > static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) > { > struct dsa_slave_priv *p = netdev_priv(dev); > struct sk_buff *nskb; > int padlen; > u8 *tag; > > padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; > > if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { > /* Let dsa_slave_xmit() free skb */ > if (__skb_put_padto(skb, skb->len + padlen, false)) > return NULL; > > nskb = skb; > } else { > nskb = alloc_skb(NET_IP_ALIGN + skb->len + > padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC); > if (!nskb) > return NULL; > skb_reserve(nskb, NET_IP_ALIGN); > > skb_reset_mac_header(nskb); > skb_set_network_header(nskb, >skb_network_header(skb) - skb->head); > skb_set_transport_header(nskb, > skb_transport_header(skb) - skb->head); > skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); > > /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free > * skb > */ > if (skb_put_padto(nskb, nskb->len + padlen)) > return NULL; > > consume_skb(skb); > } > > tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); > tag[0] = 0; > tag[1] = 1 << p->dp->index; /* destination port */ > > return nskb; > } > > Cheers, > --Prabhakar Lad > -- Florian
Re: [PATCH net-next] cxgb4: add support to display DCB info
From: Ganesh Goudar Date: Fri, 10 Aug 2018 14:47:01 +0530 > display Data Center bridging information in debug > fs. > > Signed-off-by: Casey Leedom > Signed-off-by: Ganesh Goudar Applied.
[PATCH net-next] openvswitch: Derive IP protocol number for IPv6 later frags
Currently, OVS only parses the IP protocol number for the first IPv6 fragment, but sets the IP protocol number for the later fragments to be NEXTHDR_FRAGMENT. This patch tries to derive the IP protocol number for the IPv6 later frags so that we can match on it. Signed-off-by: Yi-Hung Wei --- net/openvswitch/flow.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 56b8e7167790..3d654c4f71be 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -297,7 +297,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); - key->ip.proto = nexthdr; + if (key->ip.frag == OVS_FRAG_TYPE_LATER) { + unsigned int offset = 0; + + key->ip.proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); + } else { + key->ip.proto = nexthdr; + } return nh_len; } -- 2.7.4
[PATCH v3 2/2] net/sctp: Replace in/out stream arrays with flex_array
This path replaces physically contiguous memory arrays allocated using kmalloc_array() with flexible arrays. This enables to avoid memory allocation failures on the systems under a memory stress. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko --- include/net/sctp/structs.h | 9 ++--- net/sctp/stream.c | 88 ++ 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ce4bf844f573..f922db8029e6 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -57,6 +57,7 @@ #include /* This gets us atomic counters. */ #include /* We need sk_buff_head. */ #include/* We need tq_struct.*/ +#include /* We need flex_array. */ #include /* We need sctp* header structs. */ #include /* We need auth specific structs */ #include /* For inet_skb_parm */ @@ -1431,8 +1432,8 @@ struct sctp_stream_in { }; struct sctp_stream { - struct sctp_stream_out *out; - struct sctp_stream_in *in; + struct flex_array *out; + struct flex_array *in; __u16 outcnt; __u16 incnt; /* Current stream being sent, if any */ @@ -1458,14 +1459,14 @@ static inline struct sctp_stream_out *sctp_stream_out( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_out *)(stream->out)) + sid; + return flex_array_get(stream->out, sid); } static inline struct sctp_stream_in *sctp_stream_in( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_in *)(stream->in)) + sid; + return flex_array_get(stream->in, sid); } #define SCTP_SO(s, i) sctp_stream_out((s), (i)) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 7ca6fe4e7882..ffb940d3b57c 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -37,6 +37,53 @@ #include #include +static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count, + gfp_t gfp) +{ + struct flex_array *result; + int err; + + result = flex_array_alloc(elem_size, elem_count, gfp); + if (result) { + err = flex_array_prealloc(result, 0, elem_count, 
gfp); + if (err) { + flex_array_free(result); + result = NULL; + } + } + + return result; +} + +static void fa_free(struct flex_array *fa) +{ + if (fa) + flex_array_free(fa); +} + +static void fa_copy(struct flex_array *fa, struct flex_array *from, + size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(from, index); + flex_array_put(fa, index, elem, 0); + index++; + } +} + +static void fa_zero(struct flex_array *fa, size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(fa, index); + memset(elem, 0, fa->element_size); + index++; + } +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -78,34 +125,33 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { - kfree(new->out[i].ext); - new->out[i].ext = stream->out[i].ext; - stream->out[i].ext = NULL; + kfree(SCTP_SO(new, i)->ext); + SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; + SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) - kfree(stream->out[i].ext); + kfree(SCTP_SO(stream, i)->ext); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { - struct sctp_stream_out *out; + struct flex_array *out; + size_t elem_size = sizeof(struct sctp_stream_out); - out = kmalloc_array(outcnt, sizeof(*out), gfp); + out = fa_alloc(elem_size, outcnt, gfp); if (!out) return -ENOMEM; if (stream->out) { - memcpy(out, stream->out, min(outcnt, stream->outcnt) * -sizeof(*out)); - kfree(stream->out); + fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt)); + fa_free(stream->out); } if (outcnt > stream->outcnt) - memset(out + stream->outcnt, 0, - (outcnt - stream->outcnt) * sizeof(*out)); + fa_zero(out, s
[PATCH v3 0/2] net/sctp: Avoid allocating high order memory with kmalloc()
Each SCTP association can have up to 65535 input and output streams. For each stream type an array of sctp_stream_in or sctp_stream_out structures is allocated using kmalloc_array() function. This function allocates physically contiguous memory regions, so this can lead to allocation of memory regions of very high order, i.e.: sizeof(struct sctp_stream_out) == 24, ((65535 * 24) / 4096) == 383 memory pages (4096 bytes per page), which means 9th memory order. This can lead to memory allocation failures on the systems under a memory stress. We actually do not need these arrays of memory to be physically contiguous. A possible simple solution would be to use kvmalloc() instead of kmalloc() as kvmalloc() can allocate physically scattered pages if contiguous pages are not available. But the problem is that the allocation can happen in a softirq context with GFP_ATOMIC flag set, and kvmalloc() cannot be used in this scenario. So the other possible solution is to use flexible arrays instead of contiguous arrays of memory so that the memory would be allocated on a per-page basis. This patchset replaces kmalloc_array() with flex_array usage. It consists of two parts: * First patch is preparatory - it mechanically wraps all direct access to assoc->stream.out[] and assoc->stream.in[] arrays with SCTP_SO() and SCTP_SI() wrappers so that later a direct array access could be easily changed to an access to a flex_array (or any other possible alternative). * Second patch replaces kmalloc_array() with flex_array usage. 
Oleg Babin (2): net/sctp: Make wrappers for accessing in/out streams net/sctp: Replace in/out stream arrays with flex_array include/net/sctp/structs.h | 40 +++ net/sctp/chunk.c | 6 +- net/sctp/outqueue.c | 11 ++-- net/sctp/socket.c| 4 +- net/sctp/stream.c| 153 --- net/sctp/stream_interleave.c | 20 +++--- net/sctp/stream_sched.c | 13 ++-- net/sctp/stream_sched_prio.c | 22 +++ net/sctp/stream_sched_rr.c | 8 +-- 9 files changed, 172 insertions(+), 105 deletions(-) v2 changes: sctp_stream_in() users are updated to provide stream as an argument, sctp_stream_{in,out}_ptr() are now just sctp_stream_{in,out}(). v3 changes: Move type chages struct sctp_stream_out -> flex_array to next patch. Make sctp_stream_{in,out}() static incline and move them to a header. Performance results (single stream): * Kernel: v4.18-rc6 - stock and with 2 patches from Oleg (earlier in this thread) * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz RAM: 32 Gb * netperf: taken from https://github.com/HewlettPackard/netperf.git, compiled from sources with sctp support * netperf server and client are run on the same node * ip link set lo mtu 1500 The script used to run tests: # cat run_tests.sh #!/bin/bash for test in SCTP_STREAM SCTP_STREAM_MANY SCTP_RR SCTP_RR_MANY; do echo "TEST: $test"; for i in `seq 1 3`; do echo "Iteration: $i"; set -x netperf -t $test -H localhost -p 2 -S 20,20 -s 20,20 \ -l 60 -- -m 1452; set +x done done Results (a bit reformatted to be more readable): Recv SendSend Socket Socket Message Elapsed Size SizeSize Time Throughput bytes bytes bytessecs.10^6bits/sec v4.18-rc7 v4.18-rc7 + fixes TEST: SCTP_STREAM 212992 212992 145260.21 1125.52 1247.04 212992 212992 145260.20 1376.38 1149.95 212992 212992 145260.20 1131.40 1163.85 TEST: SCTP_STREAM_MANY 212992 212992 145260.00 .00 1310.05 212992 212992 145260.00 1188.55 1130.50 212992 212992 145260.00 1108.06 1162.50 === Local /Remote Socket Size Request Resp. Elapsed Trans. 
Send Recv Size SizeTime Rate bytes Bytes bytesbytes secs.per sec v4.18-rc7 v4.18-rc7 + fixes TEST: SCTP_RR 212992 212992 11 60.0045486.9846089.43 212992 212992 11 60.0045584.1845994.21 212992 212992 11 60.0045703.8645720.84 TEST: SCTP_RR_MANY 212992 212992 11 60.0040.75 40.77 212992 212992 11 60.0040.58 40.08 212992 212992 11 60.0039.98 39.97 Performance results for many streams: = * Kernel: v4.18-rc8 - stock and with 2 patches v3 * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz RAM: 32 Gb * sctp_test: https://github.com/sctp/lksctp-tools * both server and client are run on the same node * ip link set lo mtu 1500 * sysctl -w vm.max_map_count=6553 (need it to
[PATCH v3 1/2] net/sctp: Make wrappers for accessing in/out streams
This patch introduces wrappers for accessing in/out streams indirectly. This will enable replacing physically contiguous memory arrays of streams with flexible arrays (or maybe any other appropriate mechanism) which do memory allocation on a per-page basis. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko --- v2 changes: sctp_stream_in() users are updated to provide stream as an argument, sctp_stream_{in,out}_ptr() are now just sctp_stream_{in,out}(). v3 changes: Move type changes struct sctp_stream_out -> flex_array to next patch. Make sctp_stream_{in,out}() static inline and move them to a header. --- include/net/sctp/structs.h | 35 +--- net/sctp/chunk.c | 6 ++-- net/sctp/outqueue.c | 11 net/sctp/socket.c| 4 +-- net/sctp/stream.c| 65 +++- net/sctp/stream_interleave.c | 20 +++--- net/sctp/stream_sched.c | 13 + net/sctp/stream_sched_prio.c | 22 +++ net/sctp/stream_sched_rr.c | 8 +++--- 9 files changed, 103 insertions(+), 81 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dbe1b911a24d..ce4bf844f573 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -394,37 +394,35 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ - ((stream)->type[sid].ssn) + (sctp_stream_##type((stream), (sid))->ssn) /* Return the next SSN number for this stream. */ #define sctp_ssn_next(stream, type, sid) \ - ((stream)->type[sid].ssn++) + (sctp_stream_##type((stream), (sid))->ssn++) /* Skip over this ssn and all below. */ #define sctp_ssn_skip(stream, type, sid, ssn) \ - ((stream)->type[sid].ssn = ssn + 1) + (sctp_stream_##type((stream), (sid))->ssn = ssn + 1) /* What is the current MID number for this stream? */ #define sctp_mid_peek(stream, type, sid) \ - ((stream)->type[sid].mid) + (sctp_stream_##type((stream), (sid))->mid) /* Return the next MID number for this stream. 
*/ #define sctp_mid_next(stream, type, sid) \ - ((stream)->type[sid].mid++) + (sctp_stream_##type((stream), (sid))->mid++) /* Skip over this mid and all below. */ #define sctp_mid_skip(stream, type, sid, mid) \ - ((stream)->type[sid].mid = mid + 1) - -#define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid]) + (sctp_stream_##type((stream), (sid))->mid = mid + 1) /* What is the current MID_uo number for this stream? */ #define sctp_mid_uo_peek(stream, type, sid) \ - ((stream)->type[sid].mid_uo) + (sctp_stream_##type((stream), (sid))->mid_uo) /* Return the next MID_uo number for this stream. */ #define sctp_mid_uo_next(stream, type, sid) \ - ((stream)->type[sid].mid_uo++) + (sctp_stream_##type((stream), (sid))->mid_uo++) /* * Pointers to address related SCTP functions. @@ -1456,6 +1454,23 @@ struct sctp_stream { struct sctp_stream_interleave *si; }; +static inline struct sctp_stream_out *sctp_stream_out( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_out *)(stream->out)) + sid; +} + +static inline struct sctp_stream_in *sctp_stream_in( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_in *)(stream->in)) + sid; +} + +#define SCTP_SO(s, i) sctp_stream_out((s), (i)) +#define SCTP_SI(s, i) sctp_stream_in((s), (i)) + #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index bfb9f812e2ef..ce8087846f05 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -325,7 +325,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; @@ -339,7 +340,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } else if 
(SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX
Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()
On 08/09/2018 11:43 AM, Konstantin Khorenko wrote: On 08/04/2018 02:36 AM, Marcelo Ricardo Leitner wrote: On Fri, Aug 03, 2018 at 07:21:00PM +0300, Konstantin Khorenko wrote: ... Performance results: * Kernel: v4.18-rc6 - stock and with 2 patches from Oleg (earlier in this thread) * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz RAM: 32 Gb * netperf: taken from https://github.com/HewlettPackard/netperf.git, compiled from sources with sctp support * netperf server and client are run on the same node * ip link set lo mtu 1500 The script used to run tests: # cat run_tests.sh #!/bin/bash for test in SCTP_STREAM SCTP_STREAM_MANY SCTP_RR SCTP_RR_MANY; do echo "TEST: $test"; for i in `seq 1 3`; do echo "Iteration: $i"; set -x netperf -t $test -H localhost -p 2 -S 20,20 -s 20,20 \ -l 60 -- -m 1452; set +x done done Results (a bit reformatted to be more readable): ... Nice, good numbers. I'm missing some test that actually uses more than 1 stream. All tests in netperf uses only 1 stream. They can use 1 or Many associations on a socket, but not multiple streams. That means the numbers here show that we shouldn't see any regression on the more traditional uses, per Michael's reply on the other email, but it is not testing how it will behave if we go crazy and use the 64k streams (worst case). You'll need some other tool to test it. One idea is sctp_test, from lksctp-tools. Something like: Server side: ./sctp_test -H 172.0.0.1 -P 2 -l -d 0 Client side: time ./sctp_test -H 172.0.0.1 -P 1 \ -h 172.0.0.1 -p 2 -s \ -c 1 -M 65535 -T -t 1 -x 10 -d 0 And then measure the difference on how long each test took. Can you get these too? Interesting that in my laptop just to start this test for the first time can took some *seconds*. Seems kernel had a hard time defragmenting the memory here. 
:) Hi Marcelo, got 3 of 4 results, please take a look, but i failed to measure running the test on stock kernel when memory is fragmented, test fails with *** connect: Cannot allocate memory *** Performance results: * Kernel: v4.18-rc8 - stock and with 2 patches v3 * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz RAM: 32 Gb * sctp_test: https://github.com/sctp/lksctp-tools * both server and client are run on the same node * ip link set lo mtu 1500 * sysctl -w vm.max_map_count=6553 (need it to make memory fragmented) The script used to run tests: = # cat run_sctp_test.sh #!/bin/bash set -x uname -r ip link set lo mtu 1500 swapoff -a free cat /proc/buddyinfo ./src/apps/sctp_test -H 127.0.0.1 -P 2 -l -d 0 & sleep 3 time ./src/apps/sctp_test -H 127.0.0.1 -P 1 -h 127.0.0.1 -p 2 \ -s -c 1 -M 65535 -T -t 1 -x 10 -d 0 1>/dev/null killall -9 lt-sctp_test === Results (a bit reformatted to be more readable): 1) ms stock kernel v4.18-rc8, no memory fragmentation Info about memory - more or less same to iterations: # free totalusedfree shared buff/cache available Mem: 32906008 21315632178184 764 51466832260968 Swap: 0 0 0 cat /proc/buddyinfo Node 0, zone DMA 0 1 1 0 2 1 1 0 1 1 3 Node 0, zoneDMA32 1 3 5 4 2 2 3 6 6 4867 Node 0, zone Normal551422160204193 34 15 7 22 19 6956 test 1 test 2 test 3 real0m14.715s 0m14.593s 0m15.954s user0m0.954s0m0.955s0m0.854s sys 0m13.388s 0m12.537s 0m13.749s 2) kernel with fixes, no memory fragmentation 'free' and 'buddyinfo' similar to 1) test 1 test 2 test 3 real0m14.959s 0m14.693s 0m14.762s user0m0.948s0m0.921s0m0.929s sys 0m13.538s 0m13.225s 0m13.217s 3) kernel with fixes, memory fragmented (mmap() all available RAM, touch all pages, munmap() half of pages (each second page), do it again for RAM/2) 'free': totalusedfree shared buff/cache available Mem: 3290600830555200 302740 764 2048068 266452 Mem: 3290600830379948 541436 764 1984624 442376 Mem: 3290600830717312 262380 764 1926316 109908 /proc/buddyinfo: Node 0, zone Normal 40773 
37 34 29 0 0 0 0 0 0 0 Node 0, zone Normal 100332 68 8 4 2 1 1 0 0 0 0 Node 0, zone Normal 31113 7 2 1 0
Re: [PATCH bpf-next] bpf: enable btf for use in all maps
On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote: > On 08/10/2018 04:13 AM, Alexei Starovoitov wrote: > > On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote: > >> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote: > >>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote: > On 08/09/2018 11:14 PM, Alexei Starovoitov wrote: > > On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote: > >> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to > >> the basic arraymap") enabled support for BTF and dumping via > >> BPF fs for arraymap. However, both can be decoupled from each > >> other such that all BPF maps can be supported for attaching > >> BTF key/value information, while not all maps necessarily > >> need to dump via map_seq_show_elem() callback. > >> > >> The check in array_map_check_btf() can be generalized as > >> ultimatively the key and value size is the only contraint > >> that needs to match for the map. The fact that the key needs > >> to be of type int is optional; it could be any data type as > >> long as it matches the 4 byte key size, just like hash table > >> key or others could be of any data type as well. 
> >> > >> Minimal example of a hash table dump which then works out > >> of the box for bpftool: > >> > >> # bpftool map dump id 19 > >> [{ > >> "key": { > >> "": { > >> "vip": 0, > >> "vipv6": [] > >> }, > >> "port": 0, > >> "family": 0, > >> "proto": 0 > >> }, > >> "value": { > >> "flags": 0, > >> "vip_num": 0 > >> } > >> } > >> ] > >> > >> Signed-off-by: Daniel Borkmann > >> Cc: Yonghong Song > >> --- > >> include/linux/bpf.h | 4 +--- > >> kernel/bpf/arraymap.c | 27 --- > >> kernel/bpf/inode.c| 3 ++- > >> kernel/bpf/syscall.c | 24 > >> 4 files changed, 23 insertions(+), 35 deletions(-) > >> > >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > >> index cd8790d..eb76e8e 100644 > >> --- a/include/linux/bpf.h > >> +++ b/include/linux/bpf.h > >> @@ -48,8 +48,6 @@ struct bpf_map_ops { > >>u32 (*map_fd_sys_lookup_elem)(void *ptr); > >>void (*map_seq_show_elem)(struct bpf_map *map, void *key, > >> struct seq_file *m); > >> - int (*map_check_btf)(const struct bpf_map *map, const struct > >> btf *btf, > >> - u32 key_type_id, u32 value_type_id); > >> }; > >> > >> struct bpf_map { > >> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const > >> struct bpf_map *map) > >> > >> static inline bool bpf_map_support_seq_show(const struct bpf_map *map) > >> { > >> - return map->ops->map_seq_show_elem && map->ops->map_check_btf; > >> + return map->btf && map->ops->map_seq_show_elem; > >> } > >> > >> extern const struct bpf_map_ops bpf_map_offload_ops; > >> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c > >> index 2aa55d030..67f0bdf 100644 > >> --- a/kernel/bpf/arraymap.c > >> +++ b/kernel/bpf/arraymap.c > >> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct > >> bpf_map *map, void *key, > >>rcu_read_unlock(); > >> } > >> > >> -static int array_map_check_btf(const struct bpf_map *map, const > >> struct btf *btf, > >> - u32 btf_key_id, u32 btf_value_id) > >> -{ > >> - const struct btf_type *key_type, *value_type; > >> - u32 key_size, 
value_size; > >> - u32 int_data; > >> - > >> - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); > >> - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > >> - return -EINVAL; > >> - > >> - int_data = *(u32 *)(key_type + 1); > >> - /* bpf array can only take a u32 key. This check makes > >> - * sure that the btf matches the attr used during map_create. > >> - */ > >> - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || > >> - BTF_INT_OFFSET(int_data)) > >> - return -EINVAL; > > > > I think most of these checks are still necessary for array type. > > Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM > > is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't > > makes sense. > > Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array, > on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY > cou
Re: C45 support and mdiobus_scan
On Fri, Aug 10, 2018 at 10:20:56AM -0500, Tom Lendacky wrote: > On 8/9/2018 10:25 AM, Andrew Lunn wrote: > >>> The PCIe core will look in the device tree and when it creates the > >>> platform device for the i210 on the pcie bus, it points > >>> pdev->dev.of_node at this node. So long as you are using a platform > >>> with DT, you can do this. I hope you are not using x86.. > >> > >> Yes I am :( Any possible solution for this? > > I haven't looked too closely, but maybe you can add a new mdiobus_scan > function for 10G that attempts get_phy_device() with is_c45 set to true > and if nothing is found falls back to get_phy_device() with is_c45 set to > false. Hi Tom I did consider at one point adding extra flags to the struct mii_bus to indicate if the bus master supports C22 and or C45, and then scan the bus as appropriate. We cannot unconditionally do a C45 scan on all busses, because most bus drivers don't look for MII_ADDR_C45, and so are wrongly going to do a C22 transaction. There is also one bus driver i know of which can only do C45. But it at least returns EOPNOTSUPP if you ask it to do a C22. I think this needs addressing at some point. We are seeing more 2.5G, 5G and 10G MAC/PHY combinations, and they often need C45. So maybe adding a flag saying C45 is supported, and then scanning is a good way forward. Adding a flag saying C22 is supported might be too much work, without enough return. > I don't know what would happen if you have a non-c45 phy attached, > but it's worth a shot to try it and see for each situation. That should be fine. A C22 PHY should ignore a C45 transaction. At least that was the design idea when C45 was introduced. But i would not be too surprised if we find the odd C22 phy get confused, and we need to add some sort of quirks. Andrew
Re: [PATCH bpf] Revert "xdp: add NULL pointer check in __xdp_return()"
On Fri, 10 Aug 2018 17:16:45 +0200, Björn Töpel wrote: > Den fre 10 aug. 2018 kl 16:10 skrev Daniel Borkmann : > > > > On 08/10/2018 11:28 AM, Björn Töpel wrote: > > > From: Björn Töpel > > > > > > This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. > > > > > > The reverted commit adds a WARN to check against NULL entries in the > > > mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or > > > driver) fast path is required to make a paired > > > xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In > > > addition, a driver using a different allocation scheme than the > > > default MEM_TYPE_PAGE_SHARED is required to additionally call > > > xdp_rxq_info_reg_mem_model. > > > > > > For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures > > > that the mem_id_ht rhashtable has a properly inserted allocator id. If > > > not, this would be a driver bug. A NULL pointer kernel OOPS is > > > preferred to the WARN. > > > > > > Suggested-by: Jesper Dangaard Brouer > > > Signed-off-by: Björn Töpel > > > > Given the last bpf pr went out yesterday night, I've applied this to > > bpf-next (worst case we can just route it via stable), thanks! > > Ah, right! Thanks! > > bpf-next is OK. (Since this path is currently not used yet by any driver... > :-() Wasn't this dead code, anyway? The frame return path is for redirects, and one can't convert_to_xdp_frame presently?
[PATCH net-next 1/2] ip: add helpers to process in-order fragments faster.
This patch introduces several helper functions/macros that will be used in the follow-up patch. No runtime changes yet. The new logic (fully implemented in the second patch) is as follows: * Nodes in the rb-tree will now contain not single fragments, but lists of consecutive fragments ("runs"). * At each point in time, the current "active" run at the tail is maintained/tracked. Fragments that arrive in-order, adjacent to the previous tail fragment, are added to this tail run without triggering the re-balancing of the rb-tree. * If a fragment arrives out of order with the offset _before_ the tail run, it is inserted into the rb-tree as a single fragment. * If a fragment arrives after the current tail fragment (with a gap), it starts a new "tail" run, as is inserted into the rb-tree at the end as the head of the new run. skb->cb is used to store additional information needed here (suggested by Eric Dumazet). Reported-by: Willem de Bruijn Cc: Eric Dumazet Cc: Cc: Florian Westphal --- include/net/inet_frag.h | 6 net/ipv4/ip_fragment.c | 73 + 2 files changed, 79 insertions(+) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b86d14528188..1662cbc0b46b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -57,7 +57,9 @@ struct frag_v6_compare_key { * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head + * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail + * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far @@ -78,6 +80,7 @@ struct inet_frag_queue { struct sk_buff *fragments; /* Used in IPv6. */ struct rb_root rb_fragments; /* Used in IPv4. 
*/ struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; @@ -113,6 +116,9 @@ void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); +/* Free all skbs in the queue; return the sum of their truesizes. */ +unsigned int inet_frag_rbtree_purge(struct rb_root *root); + static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7cb7ed761d8c..26ace9d2d976 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,6 +57,57 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + struct inet_skb_parmh; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void ip4_frag_init_run(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. 
*/ +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +{ + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, +&q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + ip4_frag_init_run(skb); + q->fragments_tail = skb; + q->last_run_head = skb; +} + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -654,6 +705,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { +
[PATCH net-next 2/2] ip: process in-order fragments efficiently
This patch changes the runtime behavior of IP defrag queue: incoming in-order fragments are added to the end of the current list/"run" of in-order fragments at the tail. On some workloads, UDP stream performance is substantially improved: RX: ./udp_stream -F 10 -T 2 -l 60 TX: ./udp_stream -c -H -F 10 -T 5 -l 60 with this patchset applied on a 10Gbps receiver: throughput=9524.18 throughput_units=Mbit/s upstream (net-next): throughput=4608.93 throughput_units=Mbit/s Reported-by: Willem de Bruijn Cc: Eric Dumazet Cc: Cc: Florian Westphal --- net/ipv4/inet_fragment.c | 2 +- net/ipv4/ip_fragment.c | 110 --- 2 files changed, 70 insertions(+), 42 deletions(-) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6d258a5669e7..bcb11f3a27c0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -146,7 +146,7 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = xp; } while (fp); } else { - sum_truesize = skb_rbtree_purge(&q->rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 26ace9d2d976..88281fbce88c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -126,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, -struct net_device *dev); +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, +struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -219,7 +219,12 @@ static void ip_expire(struct timer_list *t) head = skb_rb_first(&qp->q.rb_fragments); if (!head) goto out; - rb_erase(&head->rbnode, &qp->q.rb_fragments); + if (FRAG_CB(head)->next_frag) + rb_replace_node(&head->rbnode, + &FRAG_CB(head)->next_frag->rbnode, + &qp->q.rb_fragments); + else + rb_erase(&head->rbnode, &qp->q.rb_fragments); memset(&head->rbnode, 0, sizeof(head->rbnode)); 
barrier(); } @@ -320,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; @@ -329,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp) qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; + qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; @@ -340,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct rb_node **rbn, *parent; - struct sk_buff *skb1; + struct sk_buff *skb1, *prev_tail; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -418,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) */ /* Find out where to put this fragment. */ - skb1 = qp->q.fragments_tail; - if (!skb1) { - /* This is the first fragment we've received. */ - rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); - qp->q.fragments_tail = skb; - } else if ((skb1->ip_defrag_offset + skb1->len) < end) { - /* This is the common/special case: skb goes to the end. */ + prev_tail = qp->q.fragments_tail; + if (!prev_tail) + ip4_frag_create_run(&qp->q, skb); /* First fragment. */ + else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { + /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ - if (offset < (skb1->ip_defrag_offset + skb1->len)) + if (offset < prev_tail->ip_defrag_offset + prev_tail->len) goto discard_qp; - /* Insert after skb1. */ - rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); - qp->q.fragments_tail = skb; + if (offset == prev_tail->ip_defrag_offset + prev_tail->len) + ip4_frag_append_to_last_run(&qp->q, skb); + else + ip4_frag_create_run(&qp->q, skb); } else { - /* Binary search. 
Note that skb can become the first fragment, but -* not the last (covered
Re: [PATCH bpf] Revert "xdp: add NULL pointer check in __xdp_return()"
On Fri, 10 Aug 2018 17:15:07 +0200, Björn Töpel wrote: > Den fre 10 aug. 2018 kl 12:18 skrev Jesper Dangaard Brouer > : > > > > On Fri, 10 Aug 2018 11:28:02 +0200 > > Björn Töpel wrote: > > > > > From: Björn Töpel > > > > > > This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. > > > > > > The reverted commit adds a WARN to check against NULL entries in the > > > mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or > > > driver) fast path is required to make a paired > > > xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In > > > addition, a driver using a different allocation scheme than the > > > default MEM_TYPE_PAGE_SHARED is required to additionally call > > > xdp_rxq_info_reg_mem_model. > > > > > > For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures > > > that the mem_id_ht rhashtable has a properly inserted allocator id. If > > > not, this would be a driver bug. A NULL pointer kernel OOPS is > > > preferred to the WARN. > > > > Acked-by: Jesper Dangaard Brouer > > > > As a comment says in the code: /* NB! Only valid from an xdp_buff! */ > > Which is (currently) guarded by the return/exit in convert_to_xdp_frame(). > > > > This means that this code path can only be invoked while the driver is > > still running under the RX NAPI process. Thus, there is no chance that > > the allocator-id is gone (via calling xdp_rxq_info_unreg) for this code > > path. > > > > But I really hope we at somepoint can convert a MEM_TYPE_ZERO_COPY into > > a form of xdp_frame, that can travel further into the redirect-core. > > In which case, we likely need to handle the NULL case (but also need > > other code to handle what to do with the memory backing the frame) > > > > (I'm my vision here:) > > > > I really dislike that the current Zero-Copy mode steal ALL packets, > > when ZC is enabled on a RX-queue. 
This is not better than the existing > > bypass solutions, which have ugly ways of re-injecting packet back into > > the network stack. With the integration with XDP, we have the > > flexibility of selecting frames, that we don't want to be "bypassed" > > into AF_XDP, and want the kernel process these. (The most common > > use-case is letting the kernel handle the arptable). IHMO this is what > > will/would make AF_XDP superior to other bypass solutions. Perhaps I'm misunderstanding, but I don't think that's necessarily true. AFAIU on XDP_PASS drivers should copy the frame into a skb and pass it up the stack. Granted that's fairly slow but *semantically* AF_XDP doesn't necessarily steal all the packets :) > Thanks for putting your visions/ideas here! I agree with both of your > last sections, and this is what we're working towards. AF_XDP ZC has > to play nicer with XDP. The current (well, the soon-to-be-published > [1] ;-)) ZC scheme is just a first step, and should be seen as a > starting point so people can start playing using AF_XDP. Jakub also > mentioned these issues a couple of threads ago, so there are > definitely more people feeling the ZC allocator pains. Mid-term a > sophisticated/proper and generic (for inter-driver reuse) ZC allocator > is needed; Converting xdp_buffs to xdp_frames cheaply for multi-CPU > completion, and hopefully dito for the XDP_PASS/kernel stack path. But > let's start with something simple that works, and take it from there. > > Björn > > [1] WIP: https://github.com/bjoto/linux/tree/af-xdp-i40e-zc Nice, looking forward to a refresh of i40e patches! 
:) > > > Suggested-by: Jesper Dangaard Brouer > > > Signed-off-by: Björn Töpel > > > --- > > > net/core/xdp.c | 3 +-- > > > 1 file changed, 1 insertion(+), 2 deletions(-) > > > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c > > > index 6771f1855b96..9d1f22072d5d 100644 > > > --- a/net/core/xdp.c > > > +++ b/net/core/xdp.c > > > @@ -345,8 +345,7 @@ static void __xdp_return(void *data, struct > > > xdp_mem_info *mem, bool napi_direct, > > > rcu_read_lock(); > > > /* mem->id is valid, checked in > > > xdp_rxq_info_reg_mem_model() */ > > > xa = rhashtable_lookup(mem_id_ht, &mem->id, > > > mem_id_rht_params); > > > - if (!WARN_ON_ONCE(!xa)) > > > - xa->zc_alloc->free(xa->zc_alloc, handle); > > > + xa->zc_alloc->free(xa->zc_alloc, handle); > > > rcu_read_unlock(); > > > default: > > > /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ > > > > > > > > > > > -- > > Best regards, > > Jesper Dangaard Brouer > > MSc.CS, Principal Kernel Engineer at Red Hat > > LinkedIn: http://www.linkedin.com/in/brouer
Motorcycle Owners List
-- Hi, Greetings of the day! Would you be interested in acquiring an email list of "Motorcycle Owners List" from USA? We also have data for Luxury Car Owners List, Luxury Brand Buyers List, Mercedes-Benz Owners, BMW Owners, Audi Owners, Hyundai Owners, Porsche Owners, Toyota Owners, Harley Davidson Owners List, Car Owners, HNI list, RV Owners and many more... All the contacts are opt-in verified, 100% permission based and can be used for unlimited multi-channel marketing. Please let me know your thoughts towards procuring the Motorcycle Owners List. Best Regards, Brenda Calkins Research Analyst We respect your privacy, if you do not wish to receive any further emails from our end, please reply with a subject “Leave Out”.
Re: [PATCH net] l2tp: use sk_dst_check() to avoid race on sk->sk_dst_cache
On Thu, Aug 09, 2018 at 11:54:05AM -0700, Wei Wang wrote: > From: Wei Wang > > In l2tp code, if it is a L2TP_UDP_ENCAP tunnel, tunnel->sk points to a > UDP socket. User could call sendmsg() on both this tunnel and the UDP > socket itself concurrently. As l2tp_xmit_skb() holds socket lock and call > __sk_dst_check() to refresh sk->sk_dst_cache, while udpv6_sendmsg() is > lockless and call sk_dst_check() to refresh sk->sk_dst_cache, there > could be a race and cause the dst cache to be freed multiple times. > So we fix l2tp side code to always call sk_dst_check() to garantee > xchg() is called when refreshing sk->sk_dst_cache to avoid race > conditions. > > Syzkaller reported stack trace: > BUG: KASAN: use-after-free in atomic_read > include/asm-generic/atomic-instrumented.h:21 [inline] > BUG: KASAN: use-after-free in atomic_fetch_add_unless > include/linux/atomic.h:575 [inline] > BUG: KASAN: use-after-free in atomic_add_unless include/linux/atomic.h:597 > [inline] > BUG: KASAN: use-after-free in dst_hold_safe include/net/dst.h:308 [inline] > BUG: KASAN: use-after-free in ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 > Read of size 4 at addr 8801aea9a880 by task syz-executor129/4829 > > CPU: 0 PID: 4829 Comm: syz-executor129 Not tainted 4.18.0-rc7-next-20180802+ > #30 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > Call Trace: > __dump_stack lib/dump_stack.c:77 [inline] > dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 > print_address_description+0x6c/0x20b mm/kasan/report.c:256 > kasan_report_error mm/kasan/report.c:354 [inline] > kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412 > check_memory_region_inline mm/kasan/kasan.c:260 [inline] > check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 > kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272 > atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] > atomic_fetch_add_unless include/linux/atomic.h:575 [inline] > atomic_add_unless include/linux/atomic.h:597 
[inline] > dst_hold_safe include/net/dst.h:308 [inline] > ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 > rt6_get_pcpu_route net/ipv6/route.c:1249 [inline] > ip6_pol_route+0x354/0xd20 net/ipv6/route.c:1922 > ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2098 > fib6_rule_lookup+0x283/0x890 net/ipv6/fib6_rules.c:122 > ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2126 > ip6_dst_lookup_tail+0x1278/0x1da0 net/ipv6/ip6_output.c:978 > ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 > ip6_sk_dst_lookup_flow+0x5ed/0xc50 net/ipv6/ip6_output.c:1117 > udpv6_sendmsg+0x2163/0x36b0 net/ipv6/udp.c:1354 > inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 > sock_sendmsg_nosec net/socket.c:622 [inline] > sock_sendmsg+0xd5/0x120 net/socket.c:632 > ___sys_sendmsg+0x51d/0x930 net/socket.c:2115 > __sys_sendmmsg+0x240/0x6f0 net/socket.c:2210 > __do_sys_sendmmsg net/socket.c:2239 [inline] > __se_sys_sendmmsg net/socket.c:2236 [inline] > __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2236 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > RIP: 0033:0x446a29 > Code: e8 ac b8 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f > 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 > RSP: 002b:7f4de5532db8 EFLAGS: 0246 ORIG_RAX: 0133 > RAX: ffda RBX: 006dcc38 RCX: 00446a29 > RDX: 00b8 RSI: 20001b00 RDI: 0003 > RBP: 006dcc30 R08: 7f4de5533700 R09: > R10: R11: 0246 R12: 006dcc3c > R13: 7ffe2b830fdf R14: 7f4de55339c0 R15: 0001 > > Fixes: 71b1391a4128 ("l2tp: ensure sk->dst is still valid") > Reported-by: syzbot+05f840f3b04f211ba...@syzkaller.appspotmail.com > Signed-off-by: Wei Wang > Signed-off-by: Martin KaFai Lau > Cc: David Ahern > Cc: Cong Wang > --- > net/l2tp/l2tp_core.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c > index 40261cb68e83..7166b61338d4 100644 > --- 
a/net/l2tp/l2tp_core.c > +++ b/net/l2tp/l2tp_core.c > @@ -1110,7 +1110,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct > sk_buff *skb, int hdr_len > > /* Get routing info from the tunnel socket */ > skb_dst_drop(skb); > - skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); > + skb_dst_set(skb, dst_clone(sk_dst_check(sk, 0))); > With sk_dst_check(), we're now holding an extra reference on the cached dst. How is it dropped? Shouldn't we remove dst_clone()? Also, does syzbot have a reproducer? I haven't found the original report on the mailing list.
Re: Error running AF_XDP sample application
Den fre 10 aug. 2018 kl 15:23 skrev Konrad Djimeli : > > On 2018-08-10 11:58, Konrad Djimeli wrote: > > On 2018-08-10 03:51, Jakub Kicinski wrote: > >> On Thu, 09 Aug 2018 18:18:08 +0200, kdjimeli wrote: > >>> Hello, > >>> > >>> I have been trying to test a sample AF_XDP program, but I have been > >>> experiencing some issues. > >>> After building the sample code > >>> https://github.com/torvalds/linux/tree/master/samples/bpf, > >>> when running the xdpsock binary, I get the errors > >>> "libbpf: failed to create map (name: 'xsks_map'): Invalid argument" > >>> "libbpf: failed to load object './xdpsock_kern.o" > >>> > >>> I tried to figure out the cause of the error but all I know is that it > >>> occurs at line 910 with the function > >>> call "bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)". > >>> > >>> Please I would like to inquire what could be a possible for this error. > >> > >> which kernel version are you running? > > > > My kernel version is 4.18.0-rc8+. I cloned it from > > https://github.com/torvalds/linux before building a running. > > > > My commit head(git show-ref --head) is at > > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc HEAD > > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/heads/master > > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/remotes/origin/HEAD > > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/remotes/origin/master > > ... > > > > > > I also applied the patch https://patchwork.ozlabs.org/patch/949884/ > > (samples: bpf: convert xdpsock_user.c to libbpf ), as the error was > > initially in the form show below: > > "failed to create a map: 22 Invalid argument" > > "ERROR: load_bpf_file" > > > > Thanks > > Konrad > > Also other sample applications that make use of other bpf maps, such as > BPF_MAP_TYPE_CPUMAP in xdp_redirect_cpu work fine. But the application > with BPF_MAP_TYPE_XSKMAP fails producing the error mentioned above. > > Thanks > Konrad Thanks for taking AF_XDP for a spin! 
Before I start digging into details: do you have CONFIG_XDP_SOCKETS=y in your config? :-) Björn
Re: C45 support and mdiobus_scan
On 8/9/2018 10:25 AM, Andrew Lunn wrote: >>> The PCIe core will look in the device tree and when it creates the >>> platform device for the i210 on the pcie bus, it points >>> pdev->dev.of_node at this node. So long as you are using a platform >>> with DT, you can do this. I hope you are not using x86.. >> >> Yes I am :( Any possible solution for this? I haven't looked too closely, but maybe you can add a new mdiobus_scan function for 10G that attempts get_phy_device() with is_c45 set to true and if nothing is found falls back to get_phy_device() with is_c45 set to false. I don't know what would happen if you have a non-c45 phy attached, but it's worth a shot to try it and see for each situation. Thanks, Tom > > Well, DT can be used with x86. I think Edison did that. But i assume > your PCIe host is in ACPI, not DT. So getting this linking working > will not be easy. > > There has been some work to add an ACPI binding for PHYs. I don't know > if it actually got far enough that you can hack your DSDT to add a > PHY. But i'm sure it did not get far enough that you can describe an > MDIO bus in DSDT, so it probably is not going to help you. > >> I guess in ultimate case I will have to switch to ARM based setup. > > Yes, or MIPS. > > Andrew >
Re: [PATCH bpf] Revert "xdp: add NULL pointer check in __xdp_return()"
Den fre 10 aug. 2018 kl 16:10 skrev Daniel Borkmann : > > On 08/10/2018 11:28 AM, Björn Töpel wrote: > > From: Björn Töpel > > > > This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. > > > > The reverted commit adds a WARN to check against NULL entries in the > > mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or > > driver) fast path is required to make a paired > > xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In > > addition, a driver using a different allocation scheme than the > > default MEM_TYPE_PAGE_SHARED is required to additionally call > > xdp_rxq_info_reg_mem_model. > > > > For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures > > that the mem_id_ht rhashtable has a properly inserted allocator id. If > > not, this would be a driver bug. A NULL pointer kernel OOPS is > > preferred to the WARN. > > > > Suggested-by: Jesper Dangaard Brouer > > Signed-off-by: Björn Töpel > > Given the last bpf pr went out yesterday night, I've applied this to > bpf-next (worst case we can just route it via stable), thanks! Ah, right! Thanks! bpf-next is OK. (Since this path is currently not used yet by any driver... :-() Björn
Re: [PATCH bpf] Revert "xdp: add NULL pointer check in __xdp_return()"
Den fre 10 aug. 2018 kl 12:18 skrev Jesper Dangaard Brouer : > > On Fri, 10 Aug 2018 11:28:02 +0200 > Björn Töpel wrote: > > > From: Björn Töpel > > > > This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. > > > > The reverted commit adds a WARN to check against NULL entries in the > > mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or > > driver) fast path is required to make a paired > > xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In > > addition, a driver using a different allocation scheme than the > > default MEM_TYPE_PAGE_SHARED is required to additionally call > > xdp_rxq_info_reg_mem_model. > > > > For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures > > that the mem_id_ht rhashtable has a properly inserted allocator id. If > > not, this would be a driver bug. A NULL pointer kernel OOPS is > > preferred to the WARN. > > Acked-by: Jesper Dangaard Brouer > > As a comment says in the code: /* NB! Only valid from an xdp_buff! */ > Which is (currently) guarded by the return/exit in convert_to_xdp_frame(). > > This means that this code path can only be invoked while the driver is > still running under the RX NAPI process. Thus, there is no chance that > the allocator-id is gone (via calling xdp_rxq_info_unreg) for this code > path. > > But I really hope we at somepoint can convert a MEM_TYPE_ZERO_COPY into > a form of xdp_frame, that can travel further into the redirect-core. > In which case, we likely need to handle the NULL case (but also need > other code to handle what to do with the memory backing the frame) > > (I'm my vision here:) > > I really dislike that the current Zero-Copy mode steal ALL packets, > when ZC is enabled on a RX-queue. This is not better than the existing > bypass solutions, which have ugly ways of re-injecting packet back into > the network stack. 
With the integration with XDP, we have the > flexibility of selecting frames, that we don't want to be "bypassed" > into AF_XDP, and want the kernel process these. (The most common > use-case is letting the kernel handle the arptable). IHMO this is what > will/would make AF_XDP superior to other bypass solutions. > > Thanks for putting your visions/ideas here! I agree with both of your last sections, and this is what we're working towards. AF_XDP ZC has to play nicer with XDP. The current (well, the soon-to-be-published [1] ;-)) ZC scheme is just a first step, and should be seen as a starting point so people can start playing using AF_XDP. Jakub also mentioned these issues a couple of threads ago, so there are definitely more people feeling the ZC allocator pains. Mid-term a sophisticated/proper and generic (for inter-driver reuse) ZC allocator is needed; Converting xdp_buffs to xdp_frames cheaply for multi-CPU completion, and hopefully dito for the XDP_PASS/kernel stack path. But let's start with something simple that works, and take it from there. 
Björn [1] WIP: https://github.com/bjoto/linux/tree/af-xdp-i40e-zc > > Suggested-by: Jesper Dangaard Brouer > > Signed-off-by: Björn Töpel > > --- > > net/core/xdp.c | 3 +-- > > 1 file changed, 1 insertion(+), 2 deletions(-) > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c > > index 6771f1855b96..9d1f22072d5d 100644 > > --- a/net/core/xdp.c > > +++ b/net/core/xdp.c > > @@ -345,8 +345,7 @@ static void __xdp_return(void *data, struct > > xdp_mem_info *mem, bool napi_direct, > > rcu_read_lock(); > > /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() > > */ > > xa = rhashtable_lookup(mem_id_ht, &mem->id, > > mem_id_rht_params); > > - if (!WARN_ON_ONCE(!xa)) > > - xa->zc_alloc->free(xa->zc_alloc, handle); > > + xa->zc_alloc->free(xa->zc_alloc, handle); > > rcu_read_unlock(); > > default: > > /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ > > > > -- > Best regards, > Jesper Dangaard Brouer > MSc.CS, Principal Kernel Engineer at Red Hat > LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH iproute2/net-next v2] tc_util: Add support for showing TCA_STATS_BASIC_HW statistics
On 10 Aug 2018, at 16:44, Stephen Hemminger wrote: > On Fri, 10 Aug 2018 07:59:30 -0400 > Eelco Chaudron wrote: > >> +if (bs.bytes >= bs_hw.bytes && bs.packets >= bs_hw.packets) { >> +print_string(PRINT_FP, NULL, "%s", _SL_); >> +print_string(PRINT_FP, NULL, "%s", prefix); >> +print_lluint(PRINT_ANY, "sw_bytes", >> + "Sent software %llu bytes", >> + bs.bytes - bs_hw.bytes); >> +print_uint(PRINT_ANY, "sw_packets", " %u pkt", >> + bs.packets - bs_hw.packets); >> +} >> +} >> + >> +print_string(PRINT_FP, NULL, "%s", _SL_); >> +print_string(PRINT_FP, NULL, "%s", prefix); >> +print_lluint(PRINT_ANY, "hw_bytes", "Sent hardware %llu bytes", >> + bs_hw.bytes); > > What does the output look like? See the two +’es below: $ tc -s filter show dev enp3s0np0 parent : filter protocol ip pref 1 flower chain 0 filter protocol ip pref 1 flower chain 0 handle 0x1 eth_type ipv4 dst_ip 2.0.0.0 src_ip 1.0.0.0 ip_flags nofrag in_hw action order 1: mirred (Egress Redirect to device eth1) stolen index 1 ref 1 bind 1 installed 0 sec used 0 sec Action statistics: Sent 534884742 bytes 8915697 pkt (dropped 0, overlimits 0 requeues 0) + Sent software 187542 bytes 4077 pkt + Sent hardware 534697200 bytes 8911620 pkt backlog 0b 0p requeues 0 cookie 89173e6a7001becfd486bda17e29
Re: [PATCH iproute2/net-next v2] tc_util: Add support for showing TCA_STATS_BASIC_HW statistics
On Fri, 10 Aug 2018 07:59:30 -0400 Eelco Chaudron wrote: > + if (bs.bytes >= bs_hw.bytes && bs.packets >= bs_hw.packets) { > + print_string(PRINT_FP, NULL, "%s", _SL_); > + print_string(PRINT_FP, NULL, "%s", prefix); > + print_lluint(PRINT_ANY, "sw_bytes", > + "Sent software %llu bytes", > + bs.bytes - bs_hw.bytes); > + print_uint(PRINT_ANY, "sw_packets", " %u pkt", > +bs.packets - bs_hw.packets); > + } > + } > + > + print_string(PRINT_FP, NULL, "%s", _SL_); > + print_string(PRINT_FP, NULL, "%s", prefix); > + print_lluint(PRINT_ANY, "hw_bytes", "Sent hardware %llu bytes", > + bs_hw.bytes); What does the output look like?
Re: [PATCH v8 bpf-next 00/10] veth: Driver XDP
On 08/03/2018 09:58 AM, Toshiaki Makita wrote: > This patch set introduces driver XDP for veth. > Basically this is used in conjunction with redirect action of another XDP > program. > > NIC ---> veth===veth > (XDP) (redirect)(XDP) > > In this case xdp_frame can be forwarded to the peer veth without > modification, so we can expect far better performance than generic XDP. > > > Envisioned use-cases > > > * Container managed XDP program > Container host redirects frames to containers by XDP redirect action, and > privileged containers can deploy their own XDP programs. > > * XDP program cascading > Two or more XDP programs can be called for each packet by redirecting > xdp frames to veth. > > * Internal interface for an XDP bridge > When using XDP redirection to create a virtual bridge, veth can be used > to create an internal interface for the bridge. > > > Implementation > -- > > This changeset is making use of NAPI to implement ndo_xdp_xmit and > XDP_TX/REDIRECT. This is mainly because XDP heavily relies on NAPI > context. > - patch 1: Export a function needed for veth XDP. > - patch 2-3: Basic implementation of veth XDP. > - patch 4-6: Add ndo_xdp_xmit. > - patch 7-9: Add XDP_TX and XDP_REDIRECT. > - patch 10: Performance optimization for multi-queue env. > > > Tests and performance numbers > - > > Tested with a simple XDP program which only redirects packets between > NIC and veth. I used i40e 25G NIC (XXV710) for the physical NIC. The > server has 20 of Xeon Silver 2.20 GHz cores. > > pktgen --(wire)--> XXV710 (i40e) <--(XDP redirect)--> veth===veth (XDP) > > The rightmost veth loads XDP progs and just does DROP or TX. The number > of packets is measured in the XDP progs. The leftmost pktgen sends > packets at 37.1 Mpps (almost 25G wire speed). 
> > veth XDP actionFlowsMpps > > DROP 110.6 > DROP 221.2 > DROP 10036.0 > TX 1 5.0 > TX 210.0 > TX 10031.0 > > I also measured netperf TCP_STREAM but was not so great performance due > to lack of tx/rx checksum offload and TSO, etc. > > netperf <--(wire)--> XXV710 (i40e) <--(XDP redirect)--> veth===veth (XDP > PASS) > > Direction Flows Gbps > == > external->veth1 20.8 > external->veth2 23.5 > external->veth 100 23.6 > veth->external19.0 > veth->external2 17.8 > veth->external 100 22.9 > > Also tested doing ifup/down or load/unload a XDP program repeatedly > during processing XDP packets in order to check if enabling/disabling > NAPI is working as expected, and found no problems. > > v8: > - Don't use xdp_frame pointer address to calculate skb->head, headroom, > and xdp_buff.data_hard_start. > > v7: > - Introduce xdp_scrub_frame() to clear kernel pointers in xdp_frame and > use it instead of memset(). > > v6: > - Check skb->len only if reallocation is needed. > - Add __GFP_NOWARN to alloc_page() since it can be triggered by external > events. > - Fix sparse warning around EXPORT_SYMBOL. > > v5: > - Fix broken SOBs. > > v4: > - Don't adjust MTU automatically. > - Skip peer IFF_UP check on .ndo_xdp_xmit() because it is unnecessary. > Add comments to explain that. > - Use redirect_info instead of xdp_mem_info for storing no_direct flag > to avoid per packet copy cost. > > v3: > - Drop skb bulk xmit patch since it makes little performance > difference. The hotspot in TCP skb xmit at this point is checksum > computation in skb_segment and packet copy on XDP_REDIRECT due to > cloned/nonlinear skb. > - Fix race on closing device. > - Add extack messages in ndo_bpf. > > v2: > - Squash NAPI patch with "Add driver XDP" patch. > - Remove conversion from xdp_frame to skb when NAPI is not enabled. > - Introduce per-queue XDP ring (patch 8). > - Introduce bulk skb xmit when XDP is enabled on the peer (patch 9). 
> > Signed-off-by: Toshiaki Makita > > Toshiaki Makita (10): > net: Export skb_headers_offset_update > veth: Add driver XDP > veth: Avoid drops by oversized packets when XDP is enabled > xdp: Helper function to clear kernel pointers in xdp_frame > veth: Handle xdp_frames in xdp napi ring > veth: Add ndo_xdp_xmit > bpf: Make redirect_info accessible from modules > xdp: Helpers for disabling napi_direct of xdp_return_frame > veth: Add XDP TX and REDIRECT > veth: Support per queue XDP ring > > drivers/net/veth.c | 750 > - > include/linux/filter.h | 35 +++ > include/linux/skbuff.h | 1 + > include/net/xdp.h | 7 + > net/core/filter.c | 29 +- > net/core/skbuff.c | 3 +- > net/core/xdp.c | 6 +- > 7 files changed, 801 insertions(+), 30 deletions(-) > Applied to bpf-n
Re: [bpf-next V2 PATCH 0/2] Implement sample code for XDP cpumap IP-pair load-balancing
On 08/10/2018 02:02 PM, Jesper Dangaard Brouer wrote: > Background: cpumap moves the SKB allocation out of the driver code, > and instead allocate it on the remote CPU, and invokes the regular > kernel network stack with the newly allocated SKB. > > The idea behind the XDP CPU redirect feature, is to use XDP as a > load-balancer step in-front of regular kernel network stack. But the > current sample code does not provide a good example of this. Part of > the reason is that, I have implemented this as part of Suricata XDP > load-balancer. > > Given this is the most frequent feature request I get, this patchset > implements the same XDP load-balancing as Suricata does, which is a > symmetric hash based on the IP-pairs + L4-protocol. > > The expected setup for the use-case is to reduce the number of NIC RX > queues via ethtool (as XDP can handle more per core), and via > smp_affinity assign these RX queues to a set of CPUs, which will be > handling RX packets. The CPUs that runs the regular network stack is > supplied to the sample xdp_redirect_cpu tool by specifying > the --cpu option multiple times on the cmdline. > > I do note that cpumap SKB creation is not feature complete yet, and > more work is coming. E.g. given GRO is not implemented yet, do expect > TCP workloads to be slower. My measurements do indicate UDP workloads > are faster. Applied to bpf-next, thanks Jesper!
[PATCH net-next 1/1] tc: Update README and add config
Updated README. Added config file that contains the minimum required features enabled to run the tests currently present in the kernel. This must be updated when new unittests are created and require their own modules. Signed-off-by: Keara Leibovitz --- tools/testing/selftests/tc-testing/README | 16 +++ tools/testing/selftests/tc-testing/config | 48 +++ 2 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/tc-testing/config diff --git a/tools/testing/selftests/tc-testing/README b/tools/testing/selftests/tc-testing/README index 3a0336782d2d..49a6f8c3fdae 100644 --- a/tools/testing/selftests/tc-testing/README +++ b/tools/testing/selftests/tc-testing/README @@ -17,6 +17,10 @@ REQUIREMENTS * The kernel must have veth support available, as a veth pair is created prior to running the tests. +* The kernel must have the appropriate infrastructure enabled to run all tdc + unit tests. See the config file in this directory for minimum required + features. As new tests will be added, config options list will be updated. + * All tc-related features being tested must be built in or available as modules. To check what is required in current setup run: ./tdc.py -c @@ -109,8 +113,8 @@ COMMAND LINE ARGUMENTS Run tdc.py -h to see the full list of available arguments. 
usage: tdc.py [-h] [-p PATH] [-D DIR [DIR ...]] [-f FILE [FILE ...]] - [-c [CATG [CATG ...]]] [-e ID [ID ...]] [-l] [-s] [-i] [-v] - [-d DEVICE] [-n NS] [-V] + [-c [CATG [CATG ...]]] [-e ID [ID ...]] [-l] [-s] [-i] [-v] [-N] + [-d DEVICE] [-P] [-n] [-V] Linux TC unit tests @@ -118,8 +122,10 @@ optional arguments: -h, --helpshow this help message and exit -p PATH, --path PATH The full path to the tc executable to use -v, --verbose Show the commands that are being run + -N, --notap Suppress tap results for command under test -d DEVICE, --device DEVICE Execute the test case in flower category + -P, --pause Pause execution just before post-suite stage selection: select which test cases: files plus directories; filtered by categories @@ -146,10 +152,10 @@ action: -i, --id Generate ID numbers for new test cases netns: - options for nsPlugin(run commands in net namespace) + options for nsPlugin (run commands in net namespace) - -n NS, --namespace NS -Run commands in namespace NS + -n, --namespace +Run commands in namespace as specified in tdc_config.py valgrind: options for valgrindPlugin (run command under test under Valgrind) diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config new file mode 100644 index ..203302065458 --- /dev/null +++ b/tools/testing/selftests/tc-testing/config @@ -0,0 +1,48 @@ +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_INGRESS=m + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m 
+CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_CLS_IND=y +CONFIG_NET_SCH_FIFO=y -- 2.7.4
Re: [PATCH bpf] Revert "xdp: add NULL pointer check in __xdp_return()"
On 08/10/2018 11:28 AM, Björn Töpel wrote: > From: Björn Töpel > > This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. > > The reverted commit adds a WARN to check against NULL entries in the > mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or > driver) fast path is required to make a paired > xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In > addition, a driver using a different allocation scheme than the > default MEM_TYPE_PAGE_SHARED is required to additionally call > xdp_rxq_info_reg_mem_model. > > For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures > that the mem_id_ht rhashtable has a properly inserted allocator id. If > not, this would be a driver bug. A NULL pointer kernel OOPS is > preferred to the WARN. > > Suggested-by: Jesper Dangaard Brouer > Signed-off-by: Björn Töpel Given the last bpf pr went out yesterday night, I've applied this to bpf-next (worst case we can just route it via stable), thanks!
Re: [PATCH bpf-next] bpf: enable btf for use in all maps
On 08/10/2018 02:54 PM, Martin KaFai Lau wrote: > On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote: >> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote: >>> On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote: On 08/09/2018 11:44 PM, Alexei Starovoitov wrote: > On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote: >> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote: >>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote: Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") enabled support for BTF and dumping via BPF fs for arraymap. However, both can be decoupled from each other such that all BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The check in array_map_check_btf() can be generalized as ultimatively the key and value size is the only contraint that needs to match for the map. The fact that the key needs to be of type int is optional; it could be any data type as long as it matches the 4 byte key size, just like hash table key or others could be of any data type as well. 
Minimal example of a hash table dump which then works out of the box for bpftool: # bpftool map dump id 19 [{ "key": { "": { "vip": 0, "vipv6": [] }, "port": 0, "family": 0, "proto": 0 }, "value": { "flags": 0, "vip_num": 0 } } ] Signed-off-by: Daniel Borkmann Cc: Yonghong Song --- include/linux/bpf.h | 4 +--- kernel/bpf/arraymap.c | 27 --- kernel/bpf/inode.c| 3 ++- kernel/bpf/syscall.c | 24 4 files changed, 23 insertions(+), 35 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d..eb76e8e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,8 +48,6 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); }; struct bpf_map { @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } extern const struct bpf_map_ops bpf_map_offload_ops; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030..67f0bdf 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - u32 int_data; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. 
- */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; >>> >>> I think most of these checks are still necessary for array type. >>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM >>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't >>> makes sense. >> >> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array, >> on 32 bit it may be allowed du
Re: Error running AF_XDP sample application
On 2018-08-10 11:58, Konrad Djimeli wrote: > On 2018-08-10 03:51, Jakub Kicinski wrote: >> On Thu, 09 Aug 2018 18:18:08 +0200, kdjimeli wrote: >>> Hello, >>> >>> I have been trying to test a sample AF_XDP program, but I have been >>> experiencing some issues. >>> After building the sample code >>> https://github.com/torvalds/linux/tree/master/samples/bpf, >>> when running the xdpsock binary, I get the errors >>> "libbpf: failed to create map (name: 'xsks_map'): Invalid argument" >>> "libbpf: failed to load object './xdpsock_kern.o" >>> >>> I tried to figure out the cause of the error but all I know is that it >>> occurs at line 910 with the function >>> call "bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)". >>> >>> Please I would like to inquire what could be a possible for this error. >> >> which kernel version are you running? > > My kernel version is 4.18.0-rc8+. I cloned it from > https://github.com/torvalds/linux before building a running. > > My commit head(git show-ref --head) is at > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc HEAD > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/heads/master > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/remotes/origin/HEAD > 1236568ee3cbb0d3ac62d0074a29b97ecf34cbbc refs/remotes/origin/master > ... > > > I also applied the patch https://patchwork.ozlabs.org/patch/949884/ > (samples: bpf: convert xdpsock_user.c to libbpf ), as the error was > initially in the form show below: > "failed to create a map: 22 Invalid argument" > "ERROR: load_bpf_file" > > Thanks > Konrad Also other sample applications that make use of other bpf maps, such as BPF_MAP_TYPE_CPUMAP in xdp_redirect_cpu work fine. But the application with BPF_MAP_TYPE_XSKMAP fails producing the error mentioned above. Thanks Konrad
Re: [PATCH bpf-next] bpf: enable btf for use in all maps
On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote: > On 08/10/2018 04:13 AM, Alexei Starovoitov wrote: > > On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote: > >> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote: > >>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote: > On 08/09/2018 11:14 PM, Alexei Starovoitov wrote: > > On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote: > >> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to > >> the basic arraymap") enabled support for BTF and dumping via > >> BPF fs for arraymap. However, both can be decoupled from each > >> other such that all BPF maps can be supported for attaching > >> BTF key/value information, while not all maps necessarily > >> need to dump via map_seq_show_elem() callback. > >> > >> The check in array_map_check_btf() can be generalized as > >> ultimatively the key and value size is the only contraint > >> that needs to match for the map. The fact that the key needs > >> to be of type int is optional; it could be any data type as > >> long as it matches the 4 byte key size, just like hash table > >> key or others could be of any data type as well. 
> >> > >> Minimal example of a hash table dump which then works out > >> of the box for bpftool: > >> > >> # bpftool map dump id 19 > >> [{ > >> "key": { > >> "": { > >> "vip": 0, > >> "vipv6": [] > >> }, > >> "port": 0, > >> "family": 0, > >> "proto": 0 > >> }, > >> "value": { > >> "flags": 0, > >> "vip_num": 0 > >> } > >> } > >> ] > >> > >> Signed-off-by: Daniel Borkmann > >> Cc: Yonghong Song > >> --- > >> include/linux/bpf.h | 4 +--- > >> kernel/bpf/arraymap.c | 27 --- > >> kernel/bpf/inode.c| 3 ++- > >> kernel/bpf/syscall.c | 24 > >> 4 files changed, 23 insertions(+), 35 deletions(-) > >> > >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > >> index cd8790d..eb76e8e 100644 > >> --- a/include/linux/bpf.h > >> +++ b/include/linux/bpf.h > >> @@ -48,8 +48,6 @@ struct bpf_map_ops { > >>u32 (*map_fd_sys_lookup_elem)(void *ptr); > >>void (*map_seq_show_elem)(struct bpf_map *map, void *key, > >> struct seq_file *m); > >> - int (*map_check_btf)(const struct bpf_map *map, const struct > >> btf *btf, > >> - u32 key_type_id, u32 value_type_id); > >> }; > >> > >> struct bpf_map { > >> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const > >> struct bpf_map *map) > >> > >> static inline bool bpf_map_support_seq_show(const struct bpf_map *map) > >> { > >> - return map->ops->map_seq_show_elem && map->ops->map_check_btf; > >> + return map->btf && map->ops->map_seq_show_elem; > >> } > >> > >> extern const struct bpf_map_ops bpf_map_offload_ops; > >> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c > >> index 2aa55d030..67f0bdf 100644 > >> --- a/kernel/bpf/arraymap.c > >> +++ b/kernel/bpf/arraymap.c > >> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct > >> bpf_map *map, void *key, > >>rcu_read_unlock(); > >> } > >> > >> -static int array_map_check_btf(const struct bpf_map *map, const > >> struct btf *btf, > >> - u32 btf_key_id, u32 btf_value_id) > >> -{ > >> - const struct btf_type *key_type, *value_type; > >> - u32 key_size, 
value_size; > >> - u32 int_data; > >> - > >> - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); > >> - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > >> - return -EINVAL; > >> - > >> - int_data = *(u32 *)(key_type + 1); > >> - /* bpf array can only take a u32 key. This check makes > >> - * sure that the btf matches the attr used during map_create. > >> - */ > >> - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || > >> - BTF_INT_OFFSET(int_data)) > >> - return -EINVAL; > > > > I think most of these checks are still necessary for array type. > > Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM > > is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't > > makes sense. > > Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array, > on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY > cou
[PATCH v2 bpf-next] BPF: helpers: New helper to obtain namespace data from current task
This helper obtains the active namespace from current and returns pid, tgid, device and namespace id as seen from that namespace, allowing to instrument a process inside a container. Device is read from /proc/self/ns/pid, as in the future it's possible that different pid_ns files may belong to different devices, according to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers conference. Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's scripts but this helper returns the pid as seen by the root namespace which is fine when a bcc script is not executed inside a container. When the process of interest is inside a container, pid filtering will not work if bpf_get_current_pid_tgid() is used. This helper addresses this limitation returning the pid as it's seen by the current namespace where the script is executing. This helper has the same use cases as bpf_get_current_pid_tgid() as it can be used to do pid filtering even inside a container. For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py): u32 pid = bpf_get_current_pid_tgid() >> 32; if (pid != ) return 0; Could be modified to use bpf_get_current_pidns_info() as follows: struct bpf_pidns pidns; bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns)); u32 pid = pidns.tgid; u32 nsid = pidns.nsid; if ((pid != ) && (nsid != )) return 0; To find out the name PID namespace id of a process, you could use this command: $ ps -h -o pidns -p Or this other command: $ ls -Li /proc//ns/pid Signed-off-by: Carlos Antonio Neira Bustos --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 24 +++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 64 +++ kernel/trace/bpf_trace.c | 2 + samples/bpf/Makefile | 3 ++ samples/bpf/trace_ns_info_user.c | 35 + samples/bpf/trace_ns_info_user_kern.c | 45 ++ tools/include/uapi/linux/bpf.h| 24 +++- tools/testing/selftests/bpf/bpf_helpers.h | 3 ++ 10 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 
samples/bpf/trace_ns_info_user.c create mode 100644 samples/bpf/trace_ns_info_user_kern.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..3f4b999f7c99 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -787,6 +787,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..8462f9881465 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2113,6 +2113,18 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns) + * Description + * Copies into *pidns* pid, namespace id and tgid as seen by the + * current namespace and also device from /proc/self/ns/pid. + * *size_of_pidns* must be the size of *pidns* + * + * This helper is used when pid filtering is needed inside a + * container as bpf_get_current_tgid() helper returns always the + * pid id as seen by the root namespace. + * Return + * 0 on success -EINVAL on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2196,7 +2208,8 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(get_current_pidns_info), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2724,4 +2737,13 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +/* helper bpf_get_current_pidns_info will store the following + * data, dev will contain major/minor from /proc/self/ns/pid. 
+ */ +struct bpf_pidns_info { + __u32 dev; + __u32 nsid; + __u32 tgid; + __u32 pid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4d09e610777f..98ce53ce2ea6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1796,6 +1796,7 @@ const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struc
[bpf-next V2 PATCH 0/2] Implement sample code for XDP cpumap IP-pair load-balancing
Background: cpumap moves the SKB allocation out of the driver code, and instead allocates it on the remote CPU, and invokes the regular kernel network stack with the newly allocated SKB. The idea behind the XDP CPU redirect feature is to use XDP as a load-balancer step in-front of regular kernel network stack. But the current sample code does not provide a good example of this. Part of the reason is that, I have implemented this as part of Suricata XDP load-balancer. Given this is the most frequent feature request I get, this patchset implements the same XDP load-balancing as Suricata does, which is a symmetric hash based on the IP-pairs + L4-protocol. The expected setup for the use-case is to reduce the number of NIC RX queues via ethtool (as XDP can handle more per core), and via smp_affinity assign these RX queues to a set of CPUs, which will be handling RX packets. The CPUs that run the regular network stack are supplied to the sample xdp_redirect_cpu tool by specifying the --cpu option multiple times on the cmdline. I do note that cpumap SKB creation is not feature complete yet, and more work is coming. E.g. given GRO is not implemented yet, do expect TCP workloads to be slower. My measurements do indicate UDP workloads are faster. --- Jesper Dangaard Brouer (2): samples/bpf: add Paul Hsieh's (LGPL 2.1) hash function SuperFastHash samples/bpf: xdp_redirect_cpu load balance like Suricata samples/bpf/hash_func01.h | 55 +++ samples/bpf/xdp_redirect_cpu_kern.c | 103 +++ samples/bpf/xdp_redirect_cpu_user.c |4 + 3 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 samples/bpf/hash_func01.h --
[bpf-next V2 PATCH 2/2] samples/bpf: xdp_redirect_cpu load balance like Suricata
From: Jesper Dangaard Brouer This implements XDP CPU redirection load-balancing across available CPUs, based on the hashing IP-pairs + L4-protocol. This is equivalent to the xdp-cpu-redirect feature in Suricata, which is inspired by the Suricata 'ippair' hashing code. An important property is that the hashing is flow symmetric, meaning that if the source and destination get swapped then the selected CPU will remain the same. This helps locality by placing both directions of a flow on the same CPU, in a forwarding/routing scenario. The hashing INITVAL (15485863 the 10^6th prime number) was fairly arbitrarily chosen, but experiments with kernel tree pktgen scripts (pktgen_sample04_many_flows.sh +pktgen_sample05_flow_per_thread.sh) showed this improved the distribution. This patch also changes the default loaded XDP program to be this load-balancer. As based on different user feedback, this seems to be the expected behavior of the sample xdp_redirect_cpu. Link: https://github.com/OISF/suricata/commit/796ec08dd7a63 Signed-off-by: Jesper Dangaard Brouer --- samples/bpf/xdp_redirect_cpu_kern.c | 103 +++ samples/bpf/xdp_redirect_cpu_user.c |4 + 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c index 8cb703671b04..081ef4bb4fe3 100644 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -13,6 +13,7 @@ #include #include "bpf_helpers.h" +#include "hash_func01.h" #define MAX_CPUS 12 /* WARNING - sync with _user.c */ @@ -461,6 +462,108 @@ int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) return bpf_redirect_map(&cpu_map, cpu_dest, 0); } +/* Hashing initval */ +#define INITVAL 15485863 + +static __always_inline +u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + u32 cpu_hash; + + if (iph + 1 > data_end) + return 0; + + 
cpu_hash = iph->saddr + iph->daddr; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); + + return cpu_hash; +} + +static __always_inline +u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + u32 cpu_hash; + + if (ip6h + 1 > data_end) + return 0; + + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); + + return cpu_hash; +} + +/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The + * hashing scheme is symmetric, meaning swapping IP src/dest still hit + * same CPU. + */ +SEC("xdp_cpu_map5_lb_hash_ip_pairs") +int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 *cpu_max; + u32 cpu_hash; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); + if (!cpu_max) + return XDP_ABORTED; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Hash for IPv4 and IPv6 */ + switch (eth_proto) { + case ETH_P_IP: + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_IPV6: + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ + default: + cpu_hash = 0; + } + + /* Choose CPU based on hash */ 
+ cpu_idx = cpu_hash % *cpu_max; + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_re
[bpf-next V2 PATCH 1/2] samples/bpf: add Paul Hsieh's (LGPL 2.1) hash function SuperFastHash
Adjusted function call API to take an initval. This allows the API user to set the initial value, as a seed. This could also be used for inputting the previous hash. Signed-off-by: Jesper Dangaard Brouer --- samples/bpf/hash_func01.h | 55 + 1 file changed, 55 insertions(+) create mode 100644 samples/bpf/hash_func01.h diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index ..38255812e376 --- /dev/null +++ b/samples/bpf/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPL 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp= (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { +case 3: hash += get16bits (data); +hash ^= hash << 16; +hash ^= ((signed char)data[sizeof (__u16)]) << 18; +hash += hash >> 11; +break; +case 2: hash += get16bits (data); +hash ^= hash << 11; +hash += hash >> 17; +break; +case 1: hash += (signed char)*data; +hash ^= hash << 10; +hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +}
[PATCH iproute2/net-next v2] tc_util: Add support for showing TCA_STATS_BASIC_HW statistics
Add support for showing hardware specific counters to easy troubleshooting hardware offload. $ tc -s filter show dev enp3s0np0 parent : filter protocol ip pref 1 flower chain 0 filter protocol ip pref 1 flower chain 0 handle 0x1 eth_type ipv4 dst_ip 2.0.0.0 src_ip 1.0.0.0 ip_flags nofrag in_hw action order 1: mirred (Egress Redirect to device eth1) stolen index 1 ref 1 bind 1 installed 0 sec used 0 sec Action statistics: Sent 534884742 bytes 8915697 pkt (dropped 0, overlimits 0 requeues 0) Sent software 187542 bytes 4077 pkt Sent hardware 534697200 bytes 8911620 pkt backlog 0b 0p requeues 0 cookie 89173e6a7001becfd486bda17e29 Signed-off-by: Eelco Chaudron --- v2: * Removed unnecessary initialization * Made not displaying of missing TCA_STATS_BASIC_HW more obvious * Use _SL_ macro for single line output include/uapi/linux/gen_stats.h |1 + tc/tc_util.c | 41 2 files changed, 42 insertions(+) diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 24a861c..065408e 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -12,6 +12,7 @@ enum { TCA_STATS_APP, TCA_STATS_RATE_EST64, TCA_STATS_PAD, + TCA_STATS_BASIC_HW, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) diff --git a/tc/tc_util.c b/tc/tc_util.c index d757852..5a1bbf2 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -800,6 +800,44 @@ void print_tm(FILE *f, const struct tcf_t *tm) } } +static void print_tcstats_basic_hw(struct rtattr **tbs, char *prefix) +{ + struct gnet_stats_basic bs_hw; + + if (!tbs[TCA_STATS_BASIC_HW]) + return; + + memcpy(&bs_hw, RTA_DATA(tbs[TCA_STATS_BASIC_HW]), + MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC_HW]), sizeof(bs_hw))); + + if (bs_hw.bytes == 0 && bs_hw.packets == 0) + return; + + if (tbs[TCA_STATS_BASIC]) { + struct gnet_stats_basic bs; + + memcpy(&bs, RTA_DATA(tbs[TCA_STATS_BASIC]), + MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]), + sizeof(bs))); + + if (bs.bytes >= bs_hw.bytes && bs.packets >= bs_hw.packets) { + 
print_string(PRINT_FP, NULL, "%s", _SL_); + print_string(PRINT_FP, NULL, "%s", prefix); + print_lluint(PRINT_ANY, "sw_bytes", +"Sent software %llu bytes", +bs.bytes - bs_hw.bytes); + print_uint(PRINT_ANY, "sw_packets", " %u pkt", + bs.packets - bs_hw.packets); + } + } + + print_string(PRINT_FP, NULL, "%s", _SL_); + print_string(PRINT_FP, NULL, "%s", prefix); + print_lluint(PRINT_ANY, "hw_bytes", "Sent hardware %llu bytes", +bs_hw.bytes); + print_uint(PRINT_ANY, "hw_packets", " %u pkt", bs_hw.packets); +} + void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char *prefix, struct rtattr **xstats) { SPRINT_BUF(b1); @@ -826,6 +864,9 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char *prefix, struct rtat print_uint(PRINT_ANY, "requeues", " requeues %u) ", q.requeues); } + if (tbs[TCA_STATS_BASIC_HW]) + print_tcstats_basic_hw(tbs, prefix); + if (tbs[TCA_STATS_RATE_EST64]) { struct gnet_stats_rate_est64 re = {0};