[net-next v3 2/2] bpf: Add eBPF seccomp sample programs
This adds a sample program that uses seccomp-eBPF, called seccomp1. It shows the simple ability to code seccomp filters in C. Signed-off-by: Sargun Dhillon--- samples/bpf/Makefile| 5 + samples/bpf/bpf_load.c | 9 ++-- samples/bpf/test_seccomp_kern.c | 41 samples/bpf/test_seccomp_user.c | 46 + 4 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 samples/bpf/test_seccomp_kern.c create mode 100644 samples/bpf/test_seccomp_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ec3fc8d88e87..05f21988775f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp +hostprogs-y += test_seccomp # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -93,6 +94,8 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +test_seccomp-objs := bpf_load.o $(LIBBPF) test_seccomp_user.o + # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -144,6 +147,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += test_seccomp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_test_seccomp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 69806d74fa53..856bc8b93916 100644 --- a/samples/bpf/bpf_load.c +++ 
b/samples/bpf/bpf_load.c @@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; + bool is_seccomp = strncmp(event, "seccomp", 7) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCK_OPS; } else if (is_sk_skb) { prog_type = BPF_PROG_TYPE_SK_SKB; + } else if (is_seccomp) { + prog_type = BPF_PROG_TYPE_SECCOMP; } else { printf("Unknown event '%s'\n", event); return -1; @@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk || + is_seccomp) return 0; if (is_socket || is_sockops || is_sk_skb) { @@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0) { + memcmp(shname, "sk_skb", 6) == 0 || + memcmp(shname, "seccomp", 7) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/bpf/test_seccomp_kern.c b/samples/bpf/test_seccomp_kern.c new file mode 100644 index ..a0dd39b4ba16 --- /dev/null +++ b/samples/bpf/test_seccomp_kern.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_helpers.h" +#include +#include + +#if defined(__x86_64__) +#define ARCH AUDIT_ARCH_X86_64 +#elif defined(__i386__) +#define ARCH AUDIT_ARCH_I386 +#else +#endif + +#ifdef ARCH +/* Returns EPERM when trying to close fd 999 */ +SEC("seccomp") +int bpf_prog1(struct 
seccomp_data *ctx)
+{
+	/*
+	 * Make sure this BPF program is being run on the same architecture it
+	 * was compiled on.
+	 */
+	if (ctx->arch != ARCH)
+		return SECCOMP_RET_ERRNO | EPERM;
+	if (ctx->nr == __NR_close && ctx->args[0] == 999)
+		return SECCOMP_RET_ERRNO | EPERM;
+
+	return SECCOMP_RET_ALLOW;
+}
+#else
[net-next v3 1/2] bpf, seccomp: Add eBPF filter capabilities
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant to be used for seccomp filters as an alternative to cBPF filters. The program type has relatively limited capabilities in terms of helpers, but that can be extended later on. The eBPF code loading is separated from attachment of the filter, so a privileged user can load the filter, and pass it back to an unprivileged user who can attach it and use it at a later time. In order to attach the filter itself, you need to supply a flag to the seccomp syscall indicating that a eBPF filter is being attached, as opposed to a cBPF one. Verification occurs at program load time, so the user should only receive errors related to attachment. Signed-off-by: Sargun Dhillon--- arch/Kconfig | 8 +++ include/linux/bpf_types.h| 3 + include/linux/seccomp.h | 3 +- include/uapi/linux/bpf.h | 2 + include/uapi/linux/seccomp.h | 7 +- kernel/bpf/syscall.c | 1 + kernel/seccomp.c | 159 --- 7 files changed, 156 insertions(+), 27 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 76c0b54443b1..8490d35e59d6 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -401,6 +401,14 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. +config SECCOMP_FILTER_EXTENDED + bool "Extended BPF seccomp filters" + depends on SECCOMP_FILTER && BPF_SYSCALL + depends on !CHECKPOINT_RESTORE + help + Enables seccomp filters to be written in eBPF, as opposed + to just cBPF filters. 
+ config HAVE_GCC_PLUGINS bool help diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..945c65c4e461 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index c723a5c4e3ff..a7df3ba6cf25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -5,7 +5,8 @@ #include #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ -SECCOMP_FILTER_FLAG_LOG) +SECCOMP_FILTER_FLAG_LOG | \ +SECCOMP_FILTER_FLAG_EXTENDED) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index db6bdc375126..5f96cb7ed954 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1,3 +1,4 @@ + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * @@ -133,6 +134,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SECCOMP, }; enum bpf_attach_type { diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 2a0bd9dd104d..730af6c7ec2e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -16,10 +16,11 @@ #define SECCOMP_SET_MODE_FILTER1 #define SECCOMP_GET_ACTION_AVAIL 2 -/* Valid flags for SECCOMP_SET_MODE_FILTER */ -#define SECCOMP_FILTER_FLAG_TSYNC 1 -#define SECCOMP_FILTER_FLAG_LOG2 +/* Valid flags for SECCOMP_SET_MODE_FILTER */ +#define SECCOMP_FILTER_FLAG_TSYNC (1 << 0) +#define SECCOMP_FILTER_FLAG_LOG(1 << 1) +#define SECCOMP_FILTER_FLAG_EXTENDED (1 << 2) /* * All BPF programs must return 
a 32-bit value. * The bottom 16-bits are for optional return data. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..86d6ec8b916d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && + type != BPF_PROG_TYPE_SECCOMP && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dc77548167ef..d95c24181a6c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -37,6 +37,7 @@ #include #include #include +#include /** * struct seccomp_filter - container for seccomp BPF programs @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - /* -* Installing a seccomp filter requires that the task has -* CAP_SYS_ADMIN in its namespace or be running with
[net-next v3 0/2] eBPF seccomp filters
This patchset enables seccomp filters to be written in eBPF. Although this patchset doesn't introduce much of the functionality enabled by eBPF, it lays the groundwork for it. Currently, you have to disable CHECKPOINT_RESTORE support in order to use eBPF seccomp filters, as eBPF filters cannot be retrieved via the ptrace GET_FILTER API. Any user can load a BPF seccomp filter program, and it can be pinned and reused without requiring access to the bpf syscalls. A user only requires the traditional permissions of either having CAP_SYS_ADMIN or having no_new_privs set in order to install their rule. The primary reason for not adding map support in this patchset is to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS. If we have a map that the BPF program can read, it can potentially "change" privileges after running. It seems like allowing writes only is safe, because then the program can be pure and side-effect free, and therefore not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come to an agreement, this can come in a follow-up patchset. A benchmark of this patchset is as follows for a fairly standard eBPF filter. Given this test program: for (i = 10; i < ; i++) syscall(__NR_getpid); If I implement an eBPF filter with PROG_ARRAYs, a program per syscall, and tail calls, the numbers are: ebpf JIT: 12.3% slower than native; ebpf no JIT: 13.6% slower than native; seccomp JIT: 17.6% slower than native; seccomp no JIT: 37% slower than native. The speed of the traditional seccomp filter increases O(n) with the number of syscalls with discrete rulesets, whereas eBPF is O(1) for any number of syscall filters.
Changes since v2:
* Rename sample
* Code cleanup

Changes since v1:
* Use a flag to indicate loading an eBPF filter, not a separate command
* Remove printk helper
* Remove ptrace patch / restore filter / sample
* Add some safe helpers

Sargun Dhillon (2):
  bpf, seccomp: Add eBPF filter capabilities
  bpf: Add eBPF seccomp sample programs

 arch/Kconfig                    |   8 ++
 include/linux/bpf_types.h       |   3 +
 include/linux/seccomp.h         |   3 +-
 include/uapi/linux/bpf.h        |   2 +
 include/uapi/linux/seccomp.h    |   7 +-
 kernel/bpf/syscall.c            |   1 +
 kernel/seccomp.c                | 159 ++--
 samples/bpf/Makefile            |   5 ++
 samples/bpf/bpf_load.c          |   9 ++-
 samples/bpf/test_seccomp_kern.c |  41 +++
 samples/bpf/test_seccomp_user.c |  46 
 11 files changed, 255 insertions(+), 29 deletions(-)
 create mode 100644 samples/bpf/test_seccomp_kern.c
 create mode 100644 samples/bpf/test_seccomp_user.c
--
2.14.1
Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
Sat, Feb 24, 2018 at 12:59:04AM CET, step...@networkplumber.org wrote: >On Thu, 22 Feb 2018 13:30:12 -0800 >Alexander Duyck wrote: > >> > Again, I understand your motivation. Yet I don't like your solution. >> > But if the decision is made to do this in-driver bonding, I would like >> > to see it being done in some generic way: >> > 1) share the same "in-driver bonding core" code with netvsc, >> > put to net/core. >> > 2) the "in-driver bonding core" will strictly limit the functionality, >> > like active-backup mode only, one vf, one backup, vf netdev type >> > check (so no one could enslave a tap or anything else) >> > If the user would need something more, he should employ team/bond. > >Sharing would be good, but the netvsc world would really like to only have >one visible network device. Why do you mind? All would be the same, there would be just another netdevice unused by the vm user (same as the vf netdev).
Re: [patch net-next] mlxsw: spectrum_switchdev: Allow port enslavement to a VLAN-unaware bridge
Hi Dave, On Thu, Feb 22, 2018 at 01:58:39PM -0500, David Miller wrote: > I'm waiting for this discussion to be fully resolved before applying this > patch. Just FYI... I have a fix for the issue David reported, but it is not related to this patch (problem manifests itself with VLAN-aware bridges as well).
RE: [PATCH v2 net-next 1/2] lan743x: Add main source files for new lan743x driver
> > Ok, but it seems to me that what I have is an example of "specific > > bookkeeping private information". Can you clarify the style you prefer? > > > > In cases of allocation where I can just compare a pointer to null, I > > can easily remove the flags. But in other cases I need a record of > > which steps completed in order to clean up properly. In cases where I > > need some sort of a flag, would you prefer I avoid a bit mask, and have a > standalone variable for each flag? > > Hi Bryan > > Often you know some thing has been done, because if it had not been > done, you would have bombed out with an error. In the release function you > can assume everything done in probe has been done, otherwise the probe > would not be successful. In close, you can assume everything done in open > was successful, otherwise the open would have failed. > > So probe does not need any flags. open does not need any flags. > >Andrew Hi Andrew, OK, so there are two cases where clean-up is necessary. One is through a call to remove or ndo_stop. For these cases I agree with you. Everything must have been set up correctly, so everything should be cleaned up, no flags required. But the other case is when things fail anywhere during probe, or open. In these cases things are partially initialized and I assumed I needed to clean up whatever was initialized successfully before returning an error. I used flags so I could share a common clean-up function for all possible error cases as well as the fully successful case. Without flags I would need to customize a clean-up sequence for each possible error point. Or are you suggesting that I don't need to worry about clean-up if an error happens during probe or open? Bryan
[RFC PATCH V4] pci: virtio_pci: Add SR-IOV support for virtio_pci devices
Hardware-realized virtio_pci devices can implement SR-IOV, so this patch enables its use. The device in question is an upcoming Intel NIC that implements both a virtio_net PF and virtio_net VFs. These are hardware realizations of what has been up to now been a software interface. The device in question has the following 4-part PCI IDs: PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe The patch needs no check for device ID, because the callback will never be made for devices that do not assert the capability or when run on a platform incapable of SR-IOV. One reason for this patch is because the hardware requires the vendor ID of a VF to be the same as the vendor ID of the PF that created it. So it seemed logical to simply have a fully-functioning virtio_net PF create the VFs. This patch makes that possible. Signed-off-by: Mark RustadReviewed-by: Alexander Duyck --- Changes in V4: - V3 was a mis-send, this has what was intended - Move most code to new helpers in pci/iov.c, pci_sriov_configure_generic and pci_sriov_disable_generic - Correct mislabeling of vendor and device IDs - Other minor changelog fixes - Rebased to pci/master, since most changes are in that area now - No new ifdefs with this approach (yay) Changes in V3: - Missent patch, please disregard Changes in V2: - Simplified logic from previous version, removed added driver variable - Disable SR-IOV on driver removal except when VFs are assigned - Sent as RFC to virtio-dev, linux-pci, netdev, lkml and others --- drivers/pci/iov.c | 50 drivers/virtio/virtio_pci_common.c |2 + include/linux/pci.h| 10 +++ 3 files changed, 62 insertions(+) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 677924ae0350..4b110e169b7c 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -367,6 +367,56 @@ static void sriov_disable(struct pci_dev *dev) pci_iov_set_numvfs(dev, 0); } +/** + * pci_sriov_disable_generic - standard helper to disable 
SR-IOV + * @dev:the PCI PF device whose VFs are to be disabled + */ +int pci_sriov_disable_generic(struct pci_dev *dev) +{ + /* +* If vfs are assigned we cannot shut down SR-IOV without causing +* issues, so just leave the hardware available. +*/ + if (pci_vfs_assigned(dev)) { + pci_warn(dev, +"Cannot disable SR-IOV while VFs are assigned - VFs will not be deallocated\n"); + return -EPERM; + } + pci_disable_sriov(dev); + return 0; +} +EXPORT_SYMBOL_GPL(pci_sriov_disable_generic); + +static int pci_sriov_enable(struct pci_dev *dev, int num_vfs) +{ + int rc; + + if (pci_num_vf(dev)) + return -EINVAL; + + rc = pci_enable_sriov(dev, num_vfs); + if (rc) { + pci_warn(dev, "Failed to enable PCI sriov: %d\n", rc); + return rc; + } + pci_info(dev, "SR-IOV enabled with %d VFs\n", num_vfs); + return num_vfs; +} + +/** + * pci_sriov_configure_generic - standard helper to configure SR-IOV + * @dev: the PCI PF device that is configuring SR-IOV + */ +int pci_sriov_configure_generic(struct pci_dev *dev, int num_vfs) +{ + if (num_vfs) + return pci_sriov_enable(dev, num_vfs); + if (!pci_num_vf(dev)) + return -EINVAL; + return pci_sriov_disable_generic(dev); +} +EXPORT_SYMBOL_GPL(pci_sriov_configure_generic); + static int sriov_init(struct pci_dev *dev, int pos) { int i, bar64; diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 48d4d1cf1cb6..d7679377131f 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -584,6 +584,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) else virtio_pci_modern_remove(vp_dev); + pci_sriov_disable_generic(pci_dev); pci_disable_device(pci_dev); put_device(dev); } @@ -596,6 +597,7 @@ static struct pci_driver virtio_pci_driver = { #ifdef CONFIG_PM_SLEEP .driver.pm = _pci_pm_ops, #endif + .sriov_configure = pci_sriov_configure_generic, }; module_pci_driver(virtio_pci_driver); diff --git a/include/linux/pci.h b/include/linux/pci.h index 024a1beda008..937124d4e098 100644 --- 
a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1947,6 +1947,8 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int id); int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); void pci_disable_sriov(struct pci_dev *dev); +int pci_sriov_disable_generic(struct pci_dev *dev); +int pci_sriov_configure_generic(struct pci_dev *dev, int num_vfs); int pci_iov_add_virtfn(struct pci_dev *dev, int id); void pci_iov_remove_virtfn(struct pci_dev *dev, int id); int pci_num_vf(struct pci_dev *dev); @@ -1973,6
Re: [PATCH iproute2 0/3] ss: fix slab statistics
On 2/24/18 12:16 PM, Stephen Hemminger wrote: > The ss -s command has been broken in several places since the > 2.6.13 kernel. The slab values it is looking for have changed > names and kernel started merging the values (see kernel patch > for fixing that). > > Stephen Hemminger (3): > ss: drop unused slabstat for skb's > ss: convert socket statistics to unsigned > ss: update slabinfo names and sum IPv4 and IPv6 > > misc/ss.c | 125 > +++--- > 1 file changed, 62 insertions(+), 63 deletions(-) > LGTM. Acked-by: David Ahern
Re: [PATCH iproute2-next v3] ip: link_gre6.c: Support IP6_TNL_F_ALLOW_LOCAL_REMOTE flag
On 2/21/18 4:18 AM, Petr Machata wrote: > For IP-in-IP tunnels, one can specify the [no]allow-localremote command > when configuring a device. Under the hood, this flips the > IP6_TNL_F_ALLOW_LOCAL_REMOTE flag on the netdevice. However, ip6gretap > and ip6erspan devices, where the flag is also relevant, are not IP-in-IP > tunnels, and thus there's no way to configure the flag on these > netdevices. Therefore introduce the command to link_gre6 as well. > > The original support was introduced in commit 21440d19d957 > ("ip: link_ip6tnl.c/ip6tunnel.c: Support IP6_TNL_F_ALLOW_LOCAL_REMOTE flag") > > Signed-off-by: Petr Machata> --- applied to iproute2-next
Re: [PATCH net-next 5/5] ipv6: route: dissect flow in input path if fib rules need it
On Sun, Feb 25, 2018 at 7:19 PM, David Ahernwrote: > On 2/24/18 10:44 PM, Roopa Prabhu wrote: > >> @@ -1847,12 +1858,27 @@ void ip6_route_input(struct sk_buff *skb) >> .flowi6_mark = skb->mark, >> .flowi6_proto = iph->nexthdr, >> }; >> + struct flow_keys *flkeys = NULL, _flkeys; >> >> tun_info = skb_tunnel_info(skb); >> if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) >> fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; >> + >> +#ifdef CONFIG_IPV6_MULTIPLE_TABLES >> + if (net->ipv6.fib6_rules_require_fldissect) { >> + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; >> + >> + memset(&_flkeys, 0, sizeof(_flkeys)); >> + skb_flow_dissect_flow_keys(skb, &_flkeys, flag); >> + fl6.fl6_sport = _flkeys.ports.src; >> + fl6.fl6_dport = _flkeys.ports.dst; >> + fl6.flowi6_proto = _flkeys.basic.ip_proto; >> + flkeys = &_flkeys; >> + } >> +#endif > > same here - helper versus inline. > ack
Re: [PATCH net-next 0/5] fib_rules: support sport, dport and ip proto match
On Sun, Feb 25, 2018 at 7:20 PM, David Ahern wrote: > On 2/24/18 10:44 PM, Roopa Prabhu wrote: >> From: Roopa Prabhu >> >> This series extends fib rule match support to include sport, dport >> and ip proto match (to complete the 5-tuple match support). >> Common use-cases of Policy based routing in the data center require >> 5-tuple match. The last 2 patches in the series add a call to flow dissect >> in the fwd path if required by the installed fib rules (controlled by a >> flag). >> >> v1: >> - Fix errors reported by kbuild and feedback on RFC series >> - extend port match uapi to accommodate port ranges > > Would be good to have a test script under tools/testing/selftests/net > that covers expectations for good and bad cases. Can create one based on > tools/testing/selftests/net/fib-onlink-tests.sh yep, that would help. Let me see if I can convert my current bash test script to selftests
Re: [PATCH net-next 1/5] net: fib_rules: support for match on ip_proto, sport and dport
On Sun, Feb 25, 2018 at 7:08 PM, David Ahernwrote: > On 2/24/18 10:44 PM, Roopa Prabhu wrote: >> From: Roopa Prabhu >> >> uapi for ip_proto, sport and dport range match >> in fib rules. >> >> Signed-off-by: Roopa Prabhu >> --- >> include/net/fib_rules.h| 31 +- >> include/uapi/linux/fib_rules.h | 8 >> net/core/fib_rules.c | 94 >> +- >> 3 files changed, 130 insertions(+), 3 deletions(-) >> >> diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h >> index b3d2162..6d99202 100644 >> --- a/include/net/fib_rules.h >> +++ b/include/net/fib_rules.h >> @@ -11,6 +11,11 @@ >> #include >> #include >> >> +struct fib_port_range { >> + __u16 start; >> + __u16 end; > > u16 for kernel headers; __u16 is for uapi. > ack, >> +}; >> + >> struct fib_kuid_range { >> kuid_t start; >> kuid_t end; >> @@ -27,7 +32,7 @@ struct fib_rule { >> u8 action; >> u8 l3mdev; >> u8 proto; >> - /* 1 byte hole, try to use */ >> + u8 ip_proto; >> u32 target; >> __be64 tun_id; >> struct fib_rule __rcu *ctarget; >> @@ -40,6 +45,8 @@ struct fib_rule { >> chariifname[IFNAMSIZ]; >> charoifname[IFNAMSIZ]; >> struct fib_kuid_range uid_range; >> + struct fib_port_range sport_range; >> + struct fib_port_range dport_range; >> struct rcu_head rcu; >> }; >> >> @@ -144,6 +151,28 @@ static inline u32 frh_get_table(struct fib_rule_hdr >> *frh, struct nlattr **nla) >> return frh->table; >> } >> >> +static inline bool fib_rule_port_inrange(struct fib_port_range *a, >> + __be16 port) >> +{ >> + if (!a->start) >> + return true; >> + return ntohs(port) >= a->start && >> + ntohs(port) <= a->end; >> +} >> + >> +static inline bool fib_rule_port_range_valid(const struct fib_port_range *a) >> +{ >> + return a->start > 0 && a->end < 0x && >> + a->start <= a->end; >> +} >> + >> +static inline bool fib_rule_port_range_compare(struct fib_port_range *a, >> +struct fib_port_range *b) >> +{ >> + return a->start == b->start && >> + a->end == b->end; >> +} >> + >> struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops 
*, >>struct net *); >> void fib_rules_unregister(struct fib_rules_ops *); >> diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h >> index 77d90ae..232df14 100644 >> --- a/include/uapi/linux/fib_rules.h >> +++ b/include/uapi/linux/fib_rules.h >> @@ -35,6 +35,11 @@ struct fib_rule_uid_range { >> __u32 end; >> }; >> >> +struct fib_rule_port_range { >> + __u16 start; >> + __u16 end; >> +}; >> + >> enum { >> FRA_UNSPEC, >> FRA_DST,/* destination address */ >> @@ -59,6 +64,9 @@ enum { >> FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ >> FRA_UID_RANGE, /* UID range */ >> FRA_PROTOCOL, /* Originator of the rule */ >> + FRA_IP_PROTO, /* ip proto */ >> + FRA_SPORT_RANGE, /* sport */ >> + FRA_DPORT_RANGE, /* dport */ >> __FRA_MAX >> }; >> >> diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c >> index a6aea80..5008235 100644 >> --- a/net/core/fib_rules.c >> +++ b/net/core/fib_rules.c >> @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) >> if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || >> !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) >> return false; >> + if (fib_rule_port_range_valid(&rule->sport_range)) >> + return false; >> + if (fib_rule_port_range_valid(&rule->dport_range)) >> + return false; > > Seems like that should be a check that start and end are both not 0. > Given the uses of fib_rule_port_range_valid, perhaps another helper is > needed to make this more readable -- e.g., fib_rule_port_range_set -- > which would be used here and fill_rule. yeah, was trying to not add two helpers. But I sure can.
> > >> return true; >> } >> EXPORT_SYMBOL_GPL(fib_rule_matchall); >> @@ -221,6 +225,12 @@ static int nla_put_uid_range(struct sk_buff *skb, >> struct fib_kuid_range *range) >> return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); >> } >> >> +static int nla_put_port_range(struct sk_buff *skb, int attrtype, >> + struct fib_port_range *range) >> +{ >> + return nla_put(skb, attrtype, sizeof(*range), range); >> +} >> + >> static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, >>
Re: [PATCH iproute2-next v3 8/8] iplink: Reduce number of arguments to iplink_parse()
On 2/22/18 6:02 AM, Serhey Popovych wrote: > Introduce new @struct iplink_parse_args data structure to consolidate > arguments to iplink_parse(). This will reduce number of arguments > passed to it. > > Pass this data structure to ->parse_opt() in iplink specific modules: > it may be used to get network device name and other information. > > Signed-off-by: Serhey Popovych> --- > ip/ip_common.h | 16 +--- > ip/iplink.c | 34 ++ > ip/iplink_bond.c |4 +++- > ip/iplink_bond_slave.c |4 +++- > ip/iplink_bridge.c |4 +++- > ip/iplink_bridge_slave.c |4 +++- > ip/iplink_can.c |4 +++- > ip/iplink_geneve.c |4 +++- > ip/iplink_hsr.c |4 +++- > ip/iplink_ipoib.c|4 +++- > ip/iplink_ipvlan.c |4 +++- > ip/iplink_macvlan.c |4 +++- > ip/iplink_vlan.c |4 +++- > ip/iplink_vrf.c |5 - > ip/iplink_vxcan.c| 14 ++ > ip/iplink_vxlan.c|4 +++- > ip/ipmacsec.c|4 +++- > ip/link_gre.c|6 -- > ip/link_gre6.c |6 -- > ip/link_ip6tnl.c |6 -- > ip/link_iptnl.c |6 -- > ip/link_veth.c | 14 ++ > ip/link_vti.c|6 -- > ip/link_vti6.c |6 -- > 24 files changed, 114 insertions(+), 57 deletions(-) > Seems like a lot of churn for no benefit.
Re: [PATCH iproute2-next v3 6/8] iplink: Perform most of request buffer setups and checks in iplink_parse()
On 2/22/18 6:02 AM, Serhey Popovych wrote: > This lets other users of iplink_parse() (e.g. link_veth.c) benefit from the > additional attribute checks and setups made in iplink_modify(). This > catches most weird combinations of parameters in peer device > configuration. > > Drop @link, @group and @index from the iplink_parse() parameters list: they > are not needed outside. > > While there, change return -1 to exit(-1) for group parsing errors: we > want to stop further command processing unless the -force option is given, > to get the error line easily. > > Signed-off-by: Serhey Popovych > --- > ip/ip_common.h|3 +- > ip/iplink.c | 118 > + > ip/iplink_vxcan.c | 13 +- > ip/link_veth.c| 13 +- > 4 files changed, 59 insertions(+), 88 deletions(-) > IMO veth and vxcan should not be re-using iplink_parse since they only want a subset of the parsing.
Re: [PATCH iproute2-next v3 5/8] veth,vxcan: Save/reinitialize/restore whole @struct ifinfomsg
On 2/22/18 6:02 AM, Serhey Popovych wrote: > Now in iplink_parse() we use ->ifi_change and ->ifi_flags fields and > plan to use ->ifi_index with upcoming change. > > Saving, restoring and reinitializing individual fields is error prone: > using new field in iplink_parse() without updating callers in veth and > vxcan will overwrite main device ifinfomsg data. > > Since @struct ifinfomsg is small enough with known sizeof() compiler may > inline memcpy()/memset() with few load/store instructions. > > Signed-off-by: Serhey Popovych> --- > ip/iplink_vxcan.c | 22 -- > ip/link_veth.c| 22 -- > 2 files changed, 16 insertions(+), 28 deletions(-) I don't agree that this change has any benefit. Only the flags and change field are wanted; there is no need to save the entire struct,
Re: [PATCH net-next 0/5] fib_rules: support sport, dport and ip proto match
On 2/24/18 10:44 PM, Roopa Prabhu wrote: > From: Roopa Prabhu > > This series extends fib rule match support to include sport, dport > and ip proto match (to complete the 5-tuple match support). > Common use-cases of Policy based routing in the data center require > 5-tuple match. The last 2 patches in the series add a call to flow dissect > in the fwd path if required by the installed fib rules (controlled by a flag). > > v1: > - Fix errors reported by kbuild and feedback on RFC series > - extend port match uapi to accommodate port ranges Would be good to have a test script under tools/testing/selftests/net that covers expectations for good and bad cases. Can create one based on tools/testing/selftests/net/fib-onlink-tests.sh
Re: [PATCH net-next 5/5] ipv6: route: dissect flow in input path if fib rules need it
On 2/24/18 10:44 PM, Roopa Prabhu wrote: > @@ -1847,12 +1858,27 @@ void ip6_route_input(struct sk_buff *skb) > .flowi6_mark = skb->mark, > .flowi6_proto = iph->nexthdr, > }; > + struct flow_keys *flkeys = NULL, _flkeys; > > tun_info = skb_tunnel_info(skb); > if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) > fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; > + > +#ifdef CONFIG_IPV6_MULTIPLE_TABLES > + if (net->ipv6.fib6_rules_require_fldissect) { > + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; > + > + memset(&_flkeys, 0, sizeof(_flkeys)); > + skb_flow_dissect_flow_keys(skb, &_flkeys, flag); > + fl6.fl6_sport = _flkeys.ports.src; > + fl6.fl6_dport = _flkeys.ports.dst; > + fl6.flowi6_proto = _flkeys.basic.ip_proto; > + flkeys = &_flkeys; > + } > +#endif same here - helper versus inline.
[RFC PATCH V3] virtio_pci: Add SR-IOV support
Hardware-realized virtio-pci devices can implement SR-IOV, so this patch
enables its use. The device in question is an upcoming Intel NIC that
implements both a virtio-net PF and virtio-net VFs. These are hardware
realizations of what has until now been a software-only interface.

The device in question has the following 4-part PCI IDs:

  PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe
  VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe

The patch needs no check for device ID, because the callback will never
be invoked for devices that do not assert the capability, or when run on
a platform incapable of SR-IOV.

One reason for this patch is that the hardware requires the vendor ID of
a VF to match the vendor ID of the PF that created it. So it seemed
logical to simply have a fully-functioning virtio-net PF create the VFs.
This patch makes that possible.

Signed-off-by: Mark Rustad
Reviewed-by: Alexander Duyck
---
Changes in V3:
- Move most code to a new helper in pci/iov.c, pci_sriov_configure
Changes in V2:
- Simplified logic from previous version, removed added driver variable
- Disable SR-IOV on driver removal except when VFs are assigned
- Sent as RFC to virtio-dev, linux-pci, netdev, lkml and others
---
 drivers/pci/iov.c                  | 48
 drivers/virtio/virtio_pci_common.c |  2 ++
 include/linux/pci.h                | 10
 3 files changed, 60 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924ae0350..ddd44a9d93ec 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -367,6 +367,54 @@ static void sriov_disable(struct pci_dev *dev)
 	pci_iov_set_numvfs(dev, 0);
 }

+/**
+ * pci_sriov_disable - standard helper to disable SR-IOV
+ * @dev: the PCI PF device whose VFs are to be disabled
+ */
+int pci_sriov_disable(struct pci_dev *dev)
+{
+	/*
+	 * If VFs are assigned we cannot shut down SR-IOV without causing
+	 * issues, so just leave the hardware available.
+	 */
+	if (pci_vfs_assigned(dev)) {
+		pci_warn(dev,
+			 "Cannot disable SR-IOV while VFs are assigned - VFs will not be deallocated\n");
+		return -EPERM;
+	}
+	pci_disable_sriov(dev);
+	return 0;
+}
+
+static int pci_sriov_enable(struct pci_dev *dev, int num_vfs)
+{
+	int rc;
+
+	if (pci_num_vf(dev))
+		return -EINVAL;
+
+	rc = pci_enable_sriov(dev, num_vfs);
+	if (rc) {
+		pci_warn(dev, "Failed to enable PCI sriov: %d\n", rc);
+		return rc;
+	}
+	dev_info(dev, "SR-IOV enabled with %d VFs\n", num_vfs);
+	return num_vfs;
+}
+
+/**
+ * pci_sriov_configure - standard helper to configure SR-IOV
+ * @dev: the PCI PF device that is configuring SR-IOV
+ */
+int pci_sriov_configure(struct pci_dev *dev, int num_vfs)
+{
+	if (num_vfs)
+		return pci_sriov_enable(dev, num_vfs);
+	if (!pci_num_vf(dev))
+		return -EINVAL;
+	return pci_sriov_disable(dev);
+}
+
 static int sriov_init(struct pci_dev *dev, int pos)
 {
 	int i, bar64;
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 48d4d1cf1cb6..37e353c4f8b4 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -584,6 +584,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 	else
 		virtio_pci_modern_remove(vp_dev);

+	pci_sriov_disable(pci_dev);
 	pci_disable_device(pci_dev);
 	put_device(dev);
 }
@@ -596,6 +597,7 @@ static struct pci_driver virtio_pci_driver = {
 #ifdef CONFIG_PM_SLEEP
 	.driver.pm = &virtio_pci_pm_ops,
 #endif
+	.sriov_configure = pci_sriov_configure,
 };

 module_pci_driver(virtio_pci_driver);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 024a1beda008..ef6b359afefd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1947,6 +1947,8 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
 void pci_disable_sriov(struct pci_dev *dev);
+int pci_sriov_disable(struct pci_dev *dev);
+int pci_sriov_configure(struct pci_dev *dev, int num_vfs);
 int pci_iov_add_virtfn(struct pci_dev *dev, int id);
 void pci_iov_remove_virtfn(struct pci_dev *dev, int id);
 int pci_num_vf(struct pci_dev *dev);
@@ -1973,6 +1975,14 @@ static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 static inline void pci_iov_remove_virtfn(struct pci_dev *dev, int id) { }
 static inline void pci_disable_sriov(struct pci_dev *dev) { }
+static inline int pci_sriov_disable(struct pci_dev *dev)
+{
+	return -ENODEV;
+}
+static inline int pci_sriov_configure(struct pci_dev *dev, int num_vfs)
+{
+	return -ENODEV;
+}
 static inline int pci_num_vf(struct
Re: [PATCH net-next 4/5] ipv4: route: dissect flow in input path if fib rules need it
On 2/24/18 10:44 PM, Roopa Prabhu wrote:
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 26eefa2..72dd6c6 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1783,7 +1783,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
>
>  /* if skb is set it will be used and fl4 can be NULL */

update that comment for flow_keys.

>  int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
> -		       const struct sk_buff *skb)
> +		       const struct sk_buff *skb, struct flow_keys *flkeys)
>  {
>  	struct net *net = fi->fib_net;
>  	struct flow_keys hash_keys;
> @@ -1810,14 +1810,23 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>  		if (skb->l4_hash)
>  			return skb_get_hash_raw(skb) >> 1;
>  		memset(&hash_keys, 0, sizeof(hash_keys));
> -		skb_flow_dissect_flow_keys(skb, &keys, flag);
>
> -		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> -		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
> -		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
> -		hash_keys.ports.src = keys.ports.src;
> -		hash_keys.ports.dst = keys.ports.dst;
> -		hash_keys.basic.ip_proto = keys.basic.ip_proto;
> +		if (flkeys) {
> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
> +			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
> +			hash_keys.ports.src = flkeys->ports.src;
> +			hash_keys.ports.dst = flkeys->ports.dst;
> +			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
> +		} else {
> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
> +			hash_keys.ports.src = keys.ports.src;
> +			hash_keys.ports.dst = keys.ports.dst;
> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
> +		}
>  	} else {
>  		memset(&hash_keys, 0, sizeof(hash_keys));
>  		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> @@ -1838,11 +1847,12 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>  static int ip_mkroute_input(struct sk_buff *skb,
>  			    struct fib_result *res,
>  			    struct in_device *in_dev,
> -			    __be32 daddr, __be32 saddr, u32 tos)
> +			    __be32 daddr, __be32 saddr, u32 tos,
> +			    struct flow_keys *hkeys)
>  {
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>  	if (res->fi && res->fi->fib_nhs > 1) {
> -		int h = fib_multipath_hash(res->fi, NULL, skb);
> +		int h = fib_multipath_hash(res->fi, NULL, skb, hkeys);
>
>  		fib_select_multipath(res, h);
>  	}
> @@ -1868,13 +1878,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>  			       struct fib_result *res)
>  {
>  	struct in_device *in_dev = __in_dev_get_rcu(dev);
> +	struct flow_keys *flkeys = NULL, _flkeys;
> +	struct net *net = dev_net(dev);
>  	struct ip_tunnel_info *tun_info;
> -	struct flowi4 fl4;
> +	int err = -EINVAL;
>  	unsigned int flags = 0;
>  	u32 itag = 0;
>  	struct rtable *rth;
> -	int err = -EINVAL;
> -	struct net *net = dev_net(dev);
> +	struct flowi4 fl4;
>  	bool do_cache;
>
>  	/* IP on this device is disabled. */
> @@ -1933,6 +1944,19 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>  	fl4.daddr = daddr;
>  	fl4.saddr = saddr;
>  	fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +#ifdef CONFIG_IP_MULTIPLE_TABLES
> +	if (net->ipv4.fib_rules_require_fldissect) {
> +		unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
> +
> +		memset(&_flkeys, 0, sizeof(_flkeys));
> +		skb_flow_dissect_flow_keys(skb, &_flkeys, flag);
> +		fl4.fl4_sport = _flkeys.ports.src;
> +		fl4.fl4_dport = _flkeys.ports.dst;
> +		fl4.flowi4_proto = _flkeys.basic.ip_proto;
> +		flkeys = &_flkeys;
> +	}
> +#endif

I think a helper that compiles out if the config is disabled would be cleaner here than the open-coded #ifdef block.
Re: [RFC PATCH v2] ptr_ring: linked list fallback
On 2018-02-26 09:17, Michael S. Tsirkin wrote:
> So pointer rings work fine, but they have a problem: make them too
> small and not enough entries fit. Make them too large and you start
> flushing your cache and running out of memory.
>
> This is a new idea of mine: a ring backed by a linked list. Once you
> run out of ring entries, instead of a drop you fall back on a list with
> a common lock. Should work well for the case where the ring is
> typically sized correctly, but will help address the fact that some
> users try to set e.g. tx queue length to 100.
>
> In other words, the idea is that if a user sets a really huge TX queue
> length, we allocate a ptr_ring which is smaller, and use the backup
> linked list when necessary to provide the requested TX queue length
> legitimately.
>
> My hope is this will move us closer to a direction where e.g. fq codel
> can use ptr rings without locking at all. The API is still very rough,
> and I really need to take a hard look at lock nesting.
>
> Compiled only, sending for early feedback/flames.
>
> Signed-off-by: Michael S. Tsirkin
> ---
> changes from v1:
> - added clarifications by DaveM in the commit log
> - build fixes
>
>  include/linux/ptr_ring.h | 64 +---
>  1 file changed, 61 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
> index d72b2e7..8aa8882 100644
> --- a/include/linux/ptr_ring.h
> +++ b/include/linux/ptr_ring.h
> @@ -31,11 +31,18 @@
>  #include
>  #endif
>
> +/* entries must start with the following structure */
> +struct plist {
> +	struct plist *next;
> +	struct plist *last; /* only valid in the 1st entry */
> +};

So I wonder whether or not it's better to do this in e.g. the skb_array implementation. Then it can use its own prev/next field.
> +
>  struct ptr_ring {
>  	int producer ____cacheline_aligned_in_smp;
>  	spinlock_t producer_lock;
>  	int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
>  	int consumer_tail; /* next entry to invalidate */
> +	struct plist *consumer_list;
>  	spinlock_t consumer_lock;
>  	/* Shared consumer/producer data */
>  	/* Read-only by both the producer and the consumer */
> @@ -120,10 +127,40 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
>  }
>
>  /*
> - * Note: resize (below) nests producer lock within consumer lock, so if you
> - * consume in interrupt or BH context, you must disable interrupts/BH when
> - * calling this.
> + * Note: resize API with the _fallback should be used when calling this.
>   */
> +static inline int ptr_ring_produce_fallback(struct ptr_ring *r, void *ptr)
> +{
> +	int ret;
> +	unsigned long flags;
> +	struct plist *p = ptr;
> +
> +	p->next = NULL;
> +	p->last = p;
> +
> +	spin_lock_irqsave(&r->producer_lock, flags);
> +	ret = __ptr_ring_produce(r, ptr);
> +	if (ret) {
> +		spin_lock(&r->consumer_lock);
> +		ret = __ptr_ring_produce(r, ptr);
> +		if (ret) {
> +			int producer = r->producer ? r->producer - 1 :
> +				r->size - 1;
> +			struct plist *first = r->queue[producer];
> +
> +			BUG_ON(!first);
> +
> +			first->last->next = p;
> +			first->last = p;

I believe we still need a limitation on the total size of the queue.

Thanks

> +		}
> +		spin_unlock(&r->consumer_lock);
> +	}
> +
> +	spin_unlock_irqrestore(&r->producer_lock, flags);
> +
> +	return ret;
> +}
> +
>  static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
>  {
>  	int ret;
> @@ -135,6 +172,7 @@ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
>  	return ret;
>  }
>
> +
>  static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
>  {
>  	int ret;
> @@ -359,6 +397,26 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
>  	return ptr;
>  }
>
> +static inline void *ptr_ring_consume_fallback(struct ptr_ring *r)
> +{
> +	unsigned long flags;
> +	struct plist *ptr;
> +
> +	spin_lock_irqsave(&r->consumer_lock, flags);
> +	if (r->consumer_list) {
> +		ptr = r->consumer_list;
> +		r->consumer_list = ptr->next;
> +	} else {
> +		ptr = __ptr_ring_consume(r);
> +		if (ptr) {
> +			r->consumer_list = ptr->next;
> +		}
> +	}
> +	spin_unlock_irqrestore(&r->consumer_lock, flags);
> +
> +	return ptr;
> +}
> +
>  static inline int ptr_ring_consume_batched(struct ptr_ring *r,
>  					   void **array, int n)
>  {
[PATCH net] r8152: fix tx packets accounting
From: Eric Dumazet

The r8152 driver handles TSO packets (limited to ~16KB) quite well, but
pretends each TSO logical packet is a single packet on the wire. There is
also some error since headers are accounted once, but the error rate is
small enough that we do not care.

Signed-off-by: Eric Dumazet
---
 drivers/net/usb/r8152.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 958b2e8b90f689249abfea6c713e491300a7dc94..86f7196f9d91fbf55c791fff88687a43518d66d8 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -1794,7 +1794,7 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg)
 		tx_data += len;

 		agg->skb_len += len;
-		agg->skb_num++;
+		agg->skb_num += skb_shinfo(skb)->gso_segs ?: 1;

 		dev_kfree_skb_any(skb);
Re: [PATCH net-next 1/5] net: fib_rules: support for match on ip_proto, sport and dport
On 2/24/18 10:44 PM, Roopa Prabhu wrote: > From: Roopa Prabhu> > uapi for ip_proto, sport and dport range match > in fib rules. > > Signed-off-by: Roopa Prabhu > --- > include/net/fib_rules.h| 31 +- > include/uapi/linux/fib_rules.h | 8 > net/core/fib_rules.c | 94 > +- > 3 files changed, 130 insertions(+), 3 deletions(-) > > diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h > index b3d2162..6d99202 100644 > --- a/include/net/fib_rules.h > +++ b/include/net/fib_rules.h > @@ -11,6 +11,11 @@ > #include > #include > > +struct fib_port_range { > + __u16 start; > + __u16 end; u16 for kernel headers; __u16 is for uapi. > +}; > + > struct fib_kuid_range { > kuid_t start; > kuid_t end; > @@ -27,7 +32,7 @@ struct fib_rule { > u8 action; > u8 l3mdev; > u8 proto; > - /* 1 byte hole, try to use */ > + u8 ip_proto; > u32 target; > __be64 tun_id; > struct fib_rule __rcu *ctarget; > @@ -40,6 +45,8 @@ struct fib_rule { > chariifname[IFNAMSIZ]; > charoifname[IFNAMSIZ]; > struct fib_kuid_range uid_range; > + struct fib_port_range sport_range; > + struct fib_port_range dport_range; > struct rcu_head rcu; > }; > > @@ -144,6 +151,28 @@ static inline u32 frh_get_table(struct fib_rule_hdr > *frh, struct nlattr **nla) > return frh->table; > } > > +static inline bool fib_rule_port_inrange(struct fib_port_range *a, > + __be16 port) > +{ > + if (!a->start) > + return true; > + return ntohs(port) >= a->start && > + ntohs(port) <= a->end; > +} > + > +static inline bool fib_rule_port_range_valid(const struct fib_port_range *a) > +{ > + return a->start > 0 && a->end < 0x && > + a->start <= a->end; > +} > + > +static inline bool fib_rule_port_range_compare(struct fib_port_range *a, > +struct fib_port_range *b) > +{ > + return a->start == b->start && > + a->end == b->end; > +} > + > struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, >struct net *); > void fib_rules_unregister(struct fib_rules_ops *); > diff --git a/include/uapi/linux/fib_rules.h 
b/include/uapi/linux/fib_rules.h
> index 77d90ae..232df14 100644
> --- a/include/uapi/linux/fib_rules.h
> +++ b/include/uapi/linux/fib_rules.h
> @@ -35,6 +35,11 @@ struct fib_rule_uid_range {
>  	__u32 end;
>  };
>
> +struct fib_rule_port_range {
> +	__u16 start;
> +	__u16 end;
> +};
> +
>  enum {
>  	FRA_UNSPEC,
>  	FRA_DST,	/* destination address */
> @@ -59,6 +64,9 @@ enum {
>  	FRA_L3MDEV,	/* iif or oif is l3mdev goto its table */
>  	FRA_UID_RANGE,	/* UID range */
>  	FRA_PROTOCOL,	/* Originator of the rule */
> +	FRA_IP_PROTO,	/* ip proto */
> +	FRA_SPORT_RANGE, /* sport */
> +	FRA_DPORT_RANGE, /* dport */
>  	__FRA_MAX
>  };
>
> diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
> index a6aea80..5008235 100644
> --- a/net/core/fib_rules.c
> +++ b/net/core/fib_rules.c
> @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule)
>  	if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
>  	    !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
>  		return false;
> +	if (fib_rule_port_range_valid(&rule->sport_range))
> +		return false;
> +	if (fib_rule_port_range_valid(&rule->dport_range))
> +		return false;

Seems like that should be a check that start and end are both not 0. Given the uses of fib_rule_port_range_valid, perhaps another helper is needed to make this more readable -- e.g., fib_rule_port_range_set -- which would be used here and in fill_rule.
> 	return true;
>  }
>  EXPORT_SYMBOL_GPL(fib_rule_matchall);
> @@ -221,6 +225,12 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
>  	return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
>  }
>
> +static int nla_put_port_range(struct sk_buff *skb, int attrtype,
> +			      struct fib_port_range *range)
> +{
> +	return nla_put(skb, attrtype, sizeof(*range), range);
> +}
> +
>  static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
>  			  struct flowi *fl, int flags,
>  			  struct fib_lookup_arg *arg)
> @@ -425,6 +435,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
>  	    !uid_eq(r->uid_range.end, rule->uid_range.end))
Re: [PATCH net] net: phy: Restore phy_resume() locking assumption
> > +int phy_resume(struct phy_device *phydev)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&phydev->lock);
> > +	ret = phy_resume(phydev);
>
> phy_resume -> __phy_resume?

Ah, where did I put the brown paper bag :-(

Thanks
	Andrew
[RFC PATCH] e1000e: Fix link check race condition.
Alex reported the following race condition:

/* link goes up... interrupt... schedule watchdog */
\ e1000_watchdog_task
	\ e1000e_has_link
		\ hw->mac.ops.check_for_link() === e1000e_check_for_copper_link
			\ e1000e_phy_has_link_generic(..., &link)
				link = true

			/* link goes down... interrupt */
			\ e1000_msix_other
				hw->mac.get_link_status = true

			/* link is up */
			mac->get_link_status = false

		link_active = true
		/* link_active is true, wrongly, and stays so because
		 * get_link_status is false */

Avoid this problem by making sure that we don't set get_link_status = false after having checked the link. It seems this problem has been present since the introduction of e1000e.

Link: https://lkml.org/lkml/2018/1/29/338
Reported-by: Alexander Duyck
Signed-off-by: Benjamin Poirier
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c | 41
 drivers/net/ethernet/intel/e1000e/mac.c     | 14 +++---
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index ff308b05d68c..3c2c4f87e075 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1386,6 +1386,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 */
 	if (!mac->get_link_status)
 		return 1;
+	mac->get_link_status = false;

 	/* First we want to see if the MII Status Register reports
 	 * link.  If so, then we want to get the current speed/duplex
@@ -1393,12 +1394,12 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 */
 	ret_val = e1000e_phy_has_link_generic(hw, 1, 0, &link);
 	if (ret_val)
-		return ret_val;
+		goto out;

 	if (hw->mac.type == e1000_pchlan) {
 		ret_val = e1000_k1_gig_workaround_hv(hw, link);
 		if (ret_val)
-			return ret_val;
+			goto out;
 	}

 	/* When connected at 10Mbps half-duplex, some parts are excessively
@@ -1431,7 +1432,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)

 		ret_val = hw->phy.ops.acquire(hw);
 		if (ret_val)
-			return ret_val;
+			goto out;

 		if (hw->mac.type == e1000_pch2lan)
 			emi_addr = I82579_RX_CONFIG;
@@ -1453,7 +1454,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 		hw->phy.ops.release(hw);

 		if (ret_val)
-			return ret_val;
+			goto out;

 		if (hw->mac.type >= e1000_pch_spt) {
 			u16 data;
@@ -1462,14 +1463,14 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)

 			if (speed == SPEED_1000) {
 				ret_val = hw->phy.ops.acquire(hw);
 				if (ret_val)
-					return ret_val;
+					goto out;

 				ret_val = e1e_rphy_locked(hw, PHY_REG(776, 20), &data);
 				if (ret_val) {
 					hw->phy.ops.release(hw);
-					return ret_val;
+					goto out;
 				}

 				ptr_gap = (data & (0x3FF << 2)) >> 2;
@@ -1483,18 +1484,18 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 				}
 				hw->phy.ops.release(hw);
 				if (ret_val)
-					return ret_val;
+					goto out;
 			} else {
 				ret_val = hw->phy.ops.acquire(hw);
 				if (ret_val)
-					return ret_val;
+					goto out;

 				ret_val = e1e_wphy_locked(hw, PHY_REG(776, 20), 0xC023);
 				hw->phy.ops.release(hw);
 				if (ret_val)
-					return ret_val;
+
Re: [PATCH net] net: phy: Restore phy_resume() locking assumption
Hi, Andrew

On 2018/2/26 7:04, Andrew Lunn wrote:
> commit f5e64032a799 ("net: phy: fix resume handling") changes the
> locking semantics for phy_resume() such that the caller now needs to
> hold the phy mutex. Not all call sites were adopted to this new
> semantic, resulting in warnings from the added
> WARN_ON(!mutex_is_locked(&phydev->lock)). Rather than change the
> semantics, add a __phy_resume() and restore the old behavior of
> phy_resume().
>
> Reported-by: Heiner Kallweit
> Fixes: f5e64032a799 ("net: phy: fix resume handling")
> Signed-off-by: Andrew Lunn
> ---
>  drivers/net/phy/phy.c        |  2 +-
>  drivers/net/phy/phy_device.c | 18 +-
>  include/linux/phy.h          |  1 +
>  3 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
> index e3e29c2b028b..a6f924fee584 100644
> --- a/drivers/net/phy/phy.c
> +++ b/drivers/net/phy/phy.c
> @@ -819,7 +819,7 @@ void phy_start(struct phy_device *phydev)
>  		break;
>  	case PHY_HALTED:
>  		/* if phy was suspended, bring the physical link up again */
> -		phy_resume(phydev);
> +		__phy_resume(phydev);
>
>  		/* make sure interrupts are re-enabled for the PHY */
>  		if (phy_interrupt_is_valid(phydev)) {
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index d39ae77707ef..4cfb3851ed1f 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct device *dev)
>  	if (!mdio_bus_phy_may_suspend(phydev))
>  		goto no_resume;
>
> -	mutex_lock(&phydev->lock);
>  	ret = phy_resume(phydev);
> -	mutex_unlock(&phydev->lock);
>  	if (ret < 0)
>  		return ret;
>
> @@ -1041,9 +1039,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>  	if (err)
>  		goto error;
>
> -	mutex_lock(&phydev->lock);
>  	phy_resume(phydev);
> -	mutex_unlock(&phydev->lock);
>  	phy_led_triggers_register(phydev);
>
>  	return err;
> @@ -1172,7 +1168,7 @@ int phy_suspend(struct phy_device *phydev)
>  }
>  EXPORT_SYMBOL(phy_suspend);
>
> -int phy_resume(struct phy_device *phydev)
> +int __phy_resume(struct phy_device *phydev)
>  {
>  	struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
>  	int ret = 0;
> @@ -1189,6 +1185,18 @@ int phy_resume(struct phy_device *phydev)
>
>  	return ret;
>  }
> +EXPORT_SYMBOL(__phy_resume);
> +
> +int phy_resume(struct phy_device *phydev)
> +{
> +	int ret;
> +
> +	mutex_lock(&phydev->lock);
> +	ret = phy_resume(phydev);

phy_resume -> __phy_resume?

> +	mutex_unlock(&phydev->lock);
> +
> +	return ret;
> +}
>  EXPORT_SYMBOL(phy_resume);
>
>  int phy_loopback(struct phy_device *phydev, bool enable)
> diff --git a/include/linux/phy.h b/include/linux/phy.h
> index 5a0c3e53e7c2..d7069539f351 100644
> --- a/include/linux/phy.h
> +++ b/include/linux/phy.h
> @@ -924,6 +924,7 @@ void phy_device_remove(struct phy_device *phydev);
>  int phy_init_hw(struct phy_device *phydev);
>  int phy_suspend(struct phy_device *phydev);
>  int phy_resume(struct phy_device *phydev);
> +int __phy_resume(struct phy_device *phydev);
>  int phy_loopback(struct phy_device *phydev, bool enable);
>  struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
>  			      phy_interface_t interface);
>
[PATCH bpf-next v2] samples/bpf: Add program for CPU state statistics
A CPU is active when it has running tasks, and the CPUFreq governor can select different operating points (OPPs) according to the workload; we use 'pstate' to denote a CPU state with running tasks at one specific OPP. On the other hand, a CPU is idle when only the idle task is on it, and the CPUIdle governor can select a specific idle state to power off hardware logic; we use 'cstate' to denote a CPU idle state.

Based on the trace events 'cpu_idle' and 'cpu_frequency' we can accumulate duration statistics for every state. Every time a CPU enters or exits an idle state, the trace event 'cpu_idle' is recorded; the trace event 'cpu_frequency' records CPU OPP changes, so it is easy to know how long a CPU stays at a specified OPP, during which the CPU is not in any idle state.

This patch utilizes the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses the sequence below to ensure CPU running/idle time is not missed:

- Before profiling, the user space program wakes up all CPUs once, to avoid missing accounted time for CPUs that stay in an idle state for a long time; the program forces 'scaling_max_freq' to the lowest frequency and then restores 'scaling_max_freq' to the highest frequency. This ensures the frequency is set to the lowest frequency, and later, after the workload starts, the frequency can easily be changed to a higher one;

- The user space program reads the map data and updates statistics every 5s, the same as the other sample bpf programs, to avoid large overhead introduced by the bpf program itself;

- When a signal is sent to terminate the program, the signal handler wakes up all CPUs, sets the lowest frequency and restores the highest frequency in 'scaling_max_freq'; this is exactly the same as the first step, to avoid missing accounted CPU pstate and cstate time during the last stage. Finally it reports the latest statistics.
The program has been tested on a Hikey board with octa CA53 CPUs; below is one example of the statistics result. The format mainly follows Jesper Dangaard Brouer's suggestion. Jesper reminds to get printf to pretty-print with thousands separators using the %' flag and setlocale(LC_NUMERIC, "en_US"); three different arm64 GCC toolchains were tried (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but none of them support the printf flag character %' on the arm64 platform, so the numbers are printed without grouping:

CPU states statistics:
state(ms)  cstate-0  cstate-1  cstate-2  pstate-0  pstate-1  pstate-2  pstate-3  pstate-4
CPU-0      767       6111      111863    561       31        756       853       190
CPU-1      241       10606     107956    484       125       646       990       85
CPU-2      413       19721     98735     636       84        696       757       89
CPU-3      84        11711     79989     17516     909       4811      5773      341
CPU-4      152       19610     98229     444       53        649       708       1283
CPU-5      185       8781      108697    666       91        671       677       1365
CPU-6      157       21964     95825     581       67        566       684       1284
CPU-7      125       15238     102704    398       20        665       786       1197

Cc: Daniel Lezcano
Cc: Vincent Guittot
Signed-off-by: Leo Yan
---
 samples/bpf/Makefile       |   4 +
 samples/bpf/cpustat_kern.c | 281 +
 samples/bpf/cpustat_user.c | 219 +++
 3 files changed, 504 insertions(+)
 create mode 100644 samples/bpf/cpustat_kern.c
 create mode 100644 samples/bpf/cpustat_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d..2c2a587 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += cpustat

 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o

 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +146,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += cpustat_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +191,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf
[RFC PATCH v2] ptr_ring: linked list fallback
So pointer rings work fine, but they have a problem: make them too small
and not enough entries fit. Make them too large and you start flushing
your cache and running out of memory.

This is a new idea of mine: a ring backed by a linked list. Once you run
out of ring entries, instead of a drop you fall back on a list with a
common lock. Should work well for the case where the ring is typically
sized correctly, but will help address the fact that some users try to
set e.g. tx queue length to 100.

In other words, the idea is that if a user sets a really huge TX queue
length, we allocate a ptr_ring which is smaller, and use the backup
linked list when necessary to provide the requested TX queue length
legitimately.

My hope is this will move us closer to a direction where e.g. fq codel
can use ptr rings without locking at all. The API is still very rough,
and I really need to take a hard look at lock nesting.

Compiled only, sending for early feedback/flames.

Signed-off-by: Michael S. Tsirkin
---
changes from v1:
- added clarifications by DaveM in the commit log
- build fixes

 include/linux/ptr_ring.h | 64 +---
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d72b2e7..8aa8882 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -31,11 +31,18 @@
 #include
 #endif

+/* entries must start with the following structure */
+struct plist {
+	struct plist *next;
+	struct plist *last; /* only valid in the 1st entry */
+};
+
 struct ptr_ring {
 	int producer ____cacheline_aligned_in_smp;
 	spinlock_t producer_lock;
 	int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
 	int consumer_tail; /* next entry to invalidate */
+	struct plist *consumer_list;
 	spinlock_t consumer_lock;
 	/* Shared consumer/producer data */
 	/* Read-only by both the producer and the consumer */
@@ -120,10 +127,40 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 }

 /*
- * Note: resize (below) nests producer lock within consumer lock, so if you
- * consume in interrupt or BH context, you must disable interrupts/BH when
- * calling this.
+ * Note: resize API with the _fallback should be used when calling this.
  */
+static inline int ptr_ring_produce_fallback(struct ptr_ring *r, void *ptr)
+{
+	int ret;
+	unsigned long flags;
+	struct plist *p = ptr;
+
+	p->next = NULL;
+	p->last = p;
+
+	spin_lock_irqsave(&r->producer_lock, flags);
+	ret = __ptr_ring_produce(r, ptr);
+	if (ret) {
+		spin_lock(&r->consumer_lock);
+		ret = __ptr_ring_produce(r, ptr);
+		if (ret) {
+			int producer = r->producer ? r->producer - 1 :
+				r->size - 1;
+			struct plist *first = r->queue[producer];
+
+			BUG_ON(!first);
+
+			first->last->next = p;
+			first->last = p;
+		}
+		spin_unlock(&r->consumer_lock);
+	}
+
+	spin_unlock_irqrestore(&r->producer_lock, flags);
+
+	return ret;
+}
+
 static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
 	int ret;
@@ -135,6 +172,7 @@ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
 	return ret;
 }

+
 static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
 {
 	int ret;
@@ -359,6 +397,26 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
 	return ptr;
 }

+static inline void *ptr_ring_consume_fallback(struct ptr_ring *r)
+{
+	unsigned long flags;
+	struct plist *ptr;
+
+	spin_lock_irqsave(&r->consumer_lock, flags);
+	if (r->consumer_list) {
+		ptr = r->consumer_list;
+		r->consumer_list = ptr->next;
+	} else {
+		ptr = __ptr_ring_consume(r);
+		if (ptr) {
+			r->consumer_list = ptr->next;
+		}
+	}
+	spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+	return ptr;
+}
+
 static inline int ptr_ring_consume_batched(struct ptr_ring *r,
 					   void **array, int n)
 {

--
MST
Re: [RFC PATCH] ptr_ring: linked list fallback
On Fri, Feb 16, 2018 at 04:32:05PM -0500, David Miller wrote:
> From: "Michael S. Tsirkin"
> Date: Fri, 16 Feb 2018 09:40:54 +0200
>
> > So pointer rings work fine, but they have a problem:
> > make them too small and not enough entries fit.
> > Make them too large and you start flushing your cache
> > and running out of memory.
> >
> > This is a new idea of mine: a ring backed by a
> > linked list. Once you run out of ring entries,
> > instead of a drop you fall back on a list with
> > a common lock.
> >
> > Should work well for the case where the ring is typically sized
> > correctly, but will help address the fact that some users try to set e.g.
> > tx queue length to 100.
> >
> > My hope this will move us closer to a direction where e.g. fq codel can
> > use ptr rings without locking at all.
> > The API is still very rough, and I really need to take a hard look
> > at lock nesting.
> >
> > Completely untested, sending for early feedback/flames.
> >
> > Signed-off-by: Michael S. Tsirkin
>
> So the idea is that if a user sets a really huge TX queue length, we allocate
> a ptr_ring which is smaller, and use the backup linked list when necessary
> to provide the requested TX queue length legitimately.
>
> Right?

Exactly, thanks for adding this clarification.

--
MST
linux-next: manual merge of the bpf-next tree with the bpf tree
Hi all, Today's linux-next merge of the bpf-next tree got a conflict in: tools/testing/selftests/bpf/test_verifier.c between commit: ca36960211eb ("bpf: allow xadd only on aligned memory") from the bpf tree and commit: 23d191a82c13 ("bpf: add various jit test cases") from the bpf-next tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell diff --cc tools/testing/selftests/bpf/test_verifier.c index 437c0b1c9d21,c987d3a2426f.. --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@@ -11163,64 -11140,95 +11166,153 @@@ static struct bpf_test tests[] = .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "xadd/w check unaligned stack", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "misaligned stack access off", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "xadd/w check unaligned map", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = REJECT, + .errstr = "misaligned value access off", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + 
"xadd/w check unaligned pkt", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_0, 99), + BPF_JMP_IMM(BPF_JA, 0, 0, 6), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), + BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), + BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_XADD stores into R2 packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "jit: lsh, rsh, arsh by 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 0xff), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: mov32 for ldimm64, 1", + .insns = { +
Re: [Intel-wired-lan] [next-queue PATCH 7/8] igb: Add support for adding offloaded clsflower filters
Hi Vinicius, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on jkirsher-next-queue/dev-queue] [also build test WARNING on v4.16-rc2 next-20180223] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Vinicius-Costa-Gomes/igb-Fix-not-adding-filter-elements-to-the-list/20180226-053124 base: https://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git dev-queue reproduce: # apt-get install sparse make ARCH=x86_64 allmodconfig make C=1 CF=-D__CHECK_ENDIAN__ sparse warnings: (new ones prefixed by >>) drivers/net/ethernet/intel/igb/igb_main.c:474:25: sparse: cast to restricted __le64 drivers/net/ethernet/intel/igb/igb_main.c:474:25: sparse: cast to restricted __le64 drivers/net/ethernet/intel/igb/igb_main.c:554:33: sparse: cast to restricted __le64 drivers/net/ethernet/intel/igb/igb_main.c:554:33: sparse: cast to restricted __le64 drivers/net/ethernet/intel/igb/igb_main.c:560:33: sparse: cast to restricted __le64 drivers/net/ethernet/intel/igb/igb_main.c:560:33: sparse: cast to restricted __le64 >> drivers/net/ethernet/intel/igb/igb_main.c:2573:48: sparse: incorrect type in >> assignment (different base types) @@ expected restricted __be16 vlan_tci @@ >> got unsignedrestricted __be16 vlan_tci @@ drivers/net/ethernet/intel/igb/igb_main.c:2573:48: expected restricted __be16 vlan_tci drivers/net/ethernet/intel/igb/igb_main.c:2573:48: got unsigned short vlan_priority:3 drivers/net/ethernet/intel/igb/igb_main.c:5616:46: sparse: incorrect type in argument 2 (different base types) @@ expected restricted __wsum diff @@ got restricted __wsum diff @@ drivers/net/ethernet/intel/igb/igb_main.c:5616:46: expected restricted __wsum diff drivers/net/ethernet/intel/igb/igb_main.c:5616:46: got restricted __be32 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast to restricted __be16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: 
cast from restricted __le16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast to restricted __be16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast from restricted __le16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast to restricted __be16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast from restricted __le16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast to restricted __be16 drivers/net/ethernet/intel/igb/igb_main.c:8047:31: sparse: cast from restricted __le16 vim +2573 drivers/net/ethernet/intel/igb/igb_main.c 2503 2504 static int igb_parse_cls_flower(struct igb_adapter *adapter, 2505 struct tc_cls_flower_offload *f, 2506 int traffic_class, 2507 struct igb_nfc_filter *input) 2508 { 2509 if (f->dissector->used_keys & 2510 ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | 2511BIT(FLOW_DISSECTOR_KEY_CONTROL) | 2512BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | 2513BIT(FLOW_DISSECTOR_KEY_VLAN))) { 2514 dev_err(>pdev->dev, "Unsupported key used: 0x%x\n", 2515 f->dissector->used_keys); 2516 return -EOPNOTSUPP; 2517 } 2518 2519 if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { 2520 struct flow_dissector_key_eth_addrs *key = 2521 skb_flow_dissector_target(f->dissector, 2522 FLOW_DISSECTOR_KEY_ETH_ADDRS, 2523f->key); 2524 2525 struct flow_dissector_key_eth_addrs *mask = 2526 skb_flow_dissector_target(f->dissector, 2527 FLOW_DISSECTOR_KEY_ETH_ADDRS, 2528f->mask); 2529 2530 if (is_broadcast_ether_addr(mask->dst)) { 2531 input->filter.match_flags |= 2532 IGB_FILTER_FLAG_DST_MAC_ADDR; 2533 ether_addr_copy(input->filter.dst_addr, key->dst); 2534 } 2535 2536 if (is_broadcast_ether_addr(mask->src)) { 2537 input->filter.match_flags |= 2538 IGB_FILTER_FLAG_SRC_MAC_ADDR; 2539 ether_addr_copy(input->filter.src_addr, key->src); 2540 } 2541 } 2542 2543 if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) { 2544 struct flow_dissector_key_basic *key = 2545
[PATCH V3 net-next 1/3] selftests/net: revert the zerocopy Rx path for PF_RDS
In preparation for optimized reception of zerocopy completion, revert the Rx side changes introduced by Commit dfb8434b0a94 ("selftests/net: add zerocopy support for PF_RDS test case") Signed-off-by: Sowmini Varadhan --- v2: prepare to remove sk_error_queue based path; remove recvmsg() as well, PF_RDS can also use recv() for the usage pattern in msg_zerocopy tools/testing/selftests/net/msg_zerocopy.c | 67 1 files changed, 0 insertions(+), 67 deletions(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 5cc2a53..eff9cf2 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -344,26 +344,6 @@ static int do_setup_tx(int domain, int type, int protocol) return fd; } -static int do_process_zerocopy_cookies(struct sock_extended_err *serr, - uint32_t *ckbuf, size_t nbytes) -{ - int ncookies, i; - - if (serr->ee_errno != 0) - error(1, 0, "serr: wrong error code: %u", serr->ee_errno); - ncookies = serr->ee_data; - if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES) - error(1, 0, "Returned %d cookies, max expected %d\n", - ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES); - if (nbytes != ncookies * sizeof(uint32_t)) - error(1, 0, "Expected %d cookies, got %ld\n", - ncookies, nbytes/sizeof(uint32_t)); - for (i = 0; i < ncookies; i++) - if (cfg_verbose >= 2) - fprintf(stderr, "%d\n", ckbuf[i]); - return ncookies; -} - static bool do_recv_completion(int fd) { struct sock_extended_err *serr; @@ -372,17 +352,10 @@ static bool do_recv_completion(int fd) uint32_t hi, lo, range; int ret, zerocopy; char control[100]; - uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES]; - struct iovec iov; msg.msg_control = control; msg.msg_controllen = sizeof(control); - iov.iov_base = ckbuf; - iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0])); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - ret = recvmsg(fd, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) return false; @@ -402,10 +375,6 @@ static bool
do_recv_completion(int fd) serr = (void *) CMSG_DATA(cm); - if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) { - completions += do_process_zerocopy_cookies(serr, ckbuf, ret); - return true; - } if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) error(1, 0, "serr: wrong origin: %u", serr->ee_origin); if (serr->ee_errno != 0) @@ -631,40 +600,6 @@ static void do_flush_datagram(int fd, int type) bytes += cfg_payload_len; } - -static void do_recvmsg(int fd) -{ - int ret, off = 0; - char *buf; - struct iovec iov; - struct msghdr msg; - struct sockaddr_storage din; - - buf = calloc(cfg_payload_len, sizeof(char)); - iov.iov_base = buf; - iov.iov_len = cfg_payload_len; - - memset(&msg, 0, sizeof(msg)); - msg.msg_name = &din; - msg.msg_namelen = sizeof(din); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - ret = recvmsg(fd, &msg, MSG_TRUNC); - - if (ret == -1) - error(1, errno, "recv"); - if (ret != cfg_payload_len) - error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); - - if (memcmp(buf + off, payload, ret)) - error(1, 0, "recv: data mismatch"); - - free(buf); - packets++; - bytes += cfg_payload_len; -} - static void do_rx(int domain, int type, int protocol) { uint64_t tstop; @@ -676,8 +611,6 @@ static void do_rx(int domain, int type, int protocol) do { if (type == SOCK_STREAM) do_flush_tcp(fd); - else if (domain == PF_RDS) - do_recvmsg(fd); else do_flush_datagram(fd, type); -- 1.7.1
[PATCH V3 net-next 3/3] selftests/net: reap zerocopy completions passed up as ancillary data.
PF_RDS sockets pass up cookies for zerocopy completion as ancillary data. Update msg_zerocopy to reap this information. Signed-off-by: Sowmini Varadhan --- v2: receive zerocopy completion notification as POLLIN v3: drop ncookies arg in do_process_zerocopy_cookies; Reverse christmas tree declarations; check for MSG_CTRUNC; print warning when encountering ignored cmsghdrs in do_recvmsg_completion tools/testing/selftests/net/msg_zerocopy.c | 65 --- 1 files changed, 57 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index eff9cf2..406cc70 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -344,7 +344,53 @@ static int do_setup_tx(int domain, int type, int protocol) return fd; } -static bool do_recv_completion(int fd) +static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck) +{ + int i; + + if (ck->num > RDS_MAX_ZCOOKIES) + error(1, 0, "Returned %d cookies, max expected %d\n", + ck->num, RDS_MAX_ZCOOKIES); + for (i = 0; i < ck->num; i++) + if (cfg_verbose >= 2) + fprintf(stderr, "%d\n", ck->cookies[i]); + return ck->num; +} + +static bool do_recvmsg_completion(int fd) +{ + char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))]; + struct rds_zcopy_cookies *ck; + struct cmsghdr *cmsg; + struct msghdr msg; + bool ret = false; + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + + if (recvmsg(fd, &msg, MSG_DONTWAIT)) + return ret; + + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recvmsg notification: truncated"); + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_RDS && + cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) { + + ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg); + completions += do_process_zerocopy_cookies(ck); + ret = true; + break; + } + error(0, 0, "ignoring cmsg at level %d type %d\n", + cmsg->cmsg_level,
cmsg->cmsg_type); + } + return ret; +} + +static bool do_recv_completion(int fd, int domain) { struct sock_extended_err *serr; struct msghdr msg = {}; @@ -353,6 +399,9 @@ static bool do_recv_completion(int fd) int ret, zerocopy; char control[100]; + if (domain == PF_RDS) + return do_recvmsg_completion(fd); + msg.msg_control = control; msg.msg_controllen = sizeof(control); @@ -409,20 +458,20 @@ static bool do_recv_completion(int fd) } /* Read all outstanding messages on the errqueue */ -static void do_recv_completions(int fd) +static void do_recv_completions(int fd, int domain) { - while (do_recv_completion(fd)) {} + while (do_recv_completion(fd, domain)) {} } /* Wait for all remaining completions on the errqueue */ -static void do_recv_remaining_completions(int fd) +static void do_recv_remaining_completions(int fd, int domain) { int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; while (completions < expected_completions && gettimeofday_ms() < tstop) { - if (do_poll(fd, POLLERR)) - do_recv_completions(fd); + if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR)) + do_recv_completions(fd, domain); } if (completions < expected_completions) @@ -503,13 +552,13 @@ static void do_tx(int domain, int type, int protocol) while (!do_poll(fd, POLLOUT)) { if (cfg_zerocopy) - do_recv_completions(fd); + do_recv_completions(fd, domain); } } while (gettimeofday_ms() < tstop); if (cfg_zerocopy) - do_recv_remaining_completions(fd); + do_recv_remaining_completions(fd, domain); if (close(fd)) error(1, errno, "close"); -- 1.7.1
[PATCH V3 net-next 2/3] rds: deliver zerocopy completion notification with data
This commit is an optimization over commit 01883eda72bd ("rds: support for zcopy completion notification") for PF_RDS sockets. RDS applications are predominantly request-response transactions, so it is more efficient to reduce the number of system calls and have zerocopy completion notification delivered as ancillary data on the POLLIN channel. Cookies are passed up as ancillary data (at level SOL_RDS) in a struct rds_zcopy_cookies when the returned value of recvmsg() is greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed with each message. This commit removes support for zerocopy completion notification on MSG_ERRQUEUE for PF_RDS sockets. Signed-off-by: Sowmini Varadhan --- v2: remove sk_error_queue path; lots of cautionary checks in rds_recvmsg_zcookie() and callers to make sure we don't remove cookies from the queue and then fail to pass them up to the caller v3: - bounds check on skb->cb to make sure there is enough room for struct rds_zcopy_cookies as well as the rds_znotifier; - Refactor cautionary checks in rds_recvmsg_zcookie: if no msg_control has been passed, or if there is not enough msg_controllen for a rds_zcopy_cookies, return silently (do not return error, as the caller may have wanted other ancillary data which may happen to fit in the space provided) - return bool from rds_recvmsg_zcookie, some other code cleanup include/uapi/linux/errqueue.h | 2 -- include/uapi/linux/rds.h | 7 +++ net/rds/af_rds.c | 7 +-- net/rds/message.c | 38 -- net/rds/rds.h | 2 ++ net/rds/recv.c | 31 ++- 6 files changed, 60 insertions(+), 27 deletions(-) diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 28812ed..dc64cfa 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,13 +20,11 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 -#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define
SO_EE_OFFENDER(ee) ((struct sockaddr *)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 -#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 12e3bca..a66b213 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -104,6 +104,7 @@ #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_CMSG_ZCOPY_COOKIE 12 +#define RDS_CMSG_ZCOPY_COMPLETION 13 #define RDS_INFO_FIRST 1 #define RDS_INFO_COUNTERS 1 @@ -317,6 +318,12 @@ struct rds_rdma_notify { #define RDS_RDMA_DROPPED 3 #define RDS_RDMA_OTHER_ERROR 4 +#define RDS_MAX_ZCOOKIES 8 +struct rds_zcopy_cookies { + __u32 num; + __u32 cookies[RDS_MAX_ZCOOKIES]; +}; + /* * Common set of flags for all RDMA related structs */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a937f18..f712610 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,6 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); + __skb_queue_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion - * update, or a RDMA completion). + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed.
If the application tries @@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + !list_empty(&rs->rs_notify_queue) || + !skb_queue_empty(&rs->rs_zcookie_queue)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); @@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); + skb_queue_head_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT;
[PATCH V3 net-next 0/3] RDS: optimized notification for zerocopy completion
RDS applications use predominantly request-response, transaction-based IPC, so that ingress and egress traffic are well-balanced, and it is possible/desirable to reduce system-call overhead by piggybacking the notifications for zerocopy completion response with data. Moreover, it has been pointed out that socket functions block if sk_err is non-zero, thus if the RDS code does not plan/need to use the sk_error_queue path for completion notification, it is preferable to remove the sk_error_queue related paths in RDS. Both of these goals are implemented in this series. v2: removed sk_error_queue support v3: incorporated additional code review comments (details in each patch) Sowmini Varadhan (3): selftests/net: revert the zerocopy Rx path for PF_RDS rds: deliver zerocopy completion notification with data selftests/net: reap zerocopy completions passed up as ancillary data. include/uapi/linux/errqueue.h | 2 - include/uapi/linux/rds.h | 7 ++ net/rds/af_rds.c | 7 +- net/rds/message.c | 38 - net/rds/rds.h | 2 + net/rds/recv.c | 31 +++- tools/testing/selftests/net/msg_zerocopy.c | 120 7 files changed, 111 insertions(+), 96 deletions(-)
[PATCH net] net: phy: Restore phy_resume() locking assumption
commit f5e64032a799 ("net: phy: fix resume handling") changes the locking semantics for phy_resume() such that the caller now needs to hold the phy mutex. Not all call sites were adapted to this new semantic, resulting in warnings from the added WARN_ON(!mutex_is_locked(&phydev->lock)). Rather than change the semantics, add a __phy_resume() and restore the old behavior of phy_resume(). Reported-by: Heiner Kallweit Fixes: f5e64032a799 ("net: phy: fix resume handling") Signed-off-by: Andrew Lunn --- drivers/net/phy/phy.c | 2 +- drivers/net/phy/phy_device.c | 18 +- include/linux/phy.h | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e3e29c2b028b..a6f924fee584 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -819,7 +819,7 @@ void phy_start(struct phy_device *phydev) break; case PHY_HALTED: /* if phy was suspended, bring the physical link up again */ - phy_resume(phydev); + __phy_resume(phydev); /* make sure interrupts are re-enabled for the PHY */ if (phy_interrupt_is_valid(phydev)) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d39ae77707ef..4cfb3851ed1f 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct device *dev) if (!mdio_bus_phy_may_suspend(phydev)) goto no_resume; - mutex_lock(&phydev->lock); ret = phy_resume(phydev); - mutex_unlock(&phydev->lock); if (ret < 0) return ret; @@ -1041,9 +1039,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (err) goto error; - mutex_lock(&phydev->lock); phy_resume(phydev); - mutex_unlock(&phydev->lock); phy_led_triggers_register(phydev); return err; @@ -1172,7 +1168,7 @@ int phy_suspend(struct phy_device *phydev) } EXPORT_SYMBOL(phy_suspend); -int phy_resume(struct phy_device *phydev) +int __phy_resume(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver); int ret = 0; @@ -1189,6 +1185,18 @@ int
phy_resume(struct phy_device *phydev) return ret; } +EXPORT_SYMBOL(__phy_resume); + +int phy_resume(struct phy_device *phydev) +{ + int ret; + + mutex_lock(&phydev->lock); + ret = __phy_resume(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} EXPORT_SYMBOL(phy_resume); int phy_loopback(struct phy_device *phydev, bool enable) diff --git a/include/linux/phy.h b/include/linux/phy.h index 5a0c3e53e7c2..d7069539f351 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -924,6 +924,7 @@ void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); +int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); -- 2.16.2
Re: [Intel-wired-lan] [next-queue PATCH 4/8] igb: Add support for MAC address filters specifying source addresses
On Fri, Feb 23, 2018 at 5:20 PM, Vinicius Costa Gomeswrote: > Makes it possible to direct packets to queues based on their source > address. Documents the expected usage of the 'flags' parameter. > > Signed-off-by: Vinicius Costa Gomes > --- > drivers/net/ethernet/intel/igb/e1000_defines.h | 1 + > drivers/net/ethernet/intel/igb/igb.h | 1 + > drivers/net/ethernet/intel/igb/igb_main.c | 35 > +++--- > 3 files changed, 28 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h > b/drivers/net/ethernet/intel/igb/e1000_defines.h > index 573bf177fd08..c6f552de30dd 100644 > --- a/drivers/net/ethernet/intel/igb/e1000_defines.h > +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h > @@ -490,6 +490,7 @@ > * manageability enabled, allowing us room for 15 multicast addresses. > */ > #define E1000_RAH_AV 0x8000/* Receive descriptor valid */ > +#define E1000_RAH_ASEL_SRC_ADDR 0x0001 > #define E1000_RAH_QSEL_ENABLE 0x1000 > #define E1000_RAL_MAC_ADDR_LEN 4 > #define E1000_RAH_MAC_ADDR_LEN 2 > diff --git a/drivers/net/ethernet/intel/igb/igb.h > b/drivers/net/ethernet/intel/igb/igb.h > index 1c6b8d9176a8..d5cd5f6708d9 100644 > --- a/drivers/net/ethernet/intel/igb/igb.h > +++ b/drivers/net/ethernet/intel/igb/igb.h > @@ -472,6 +472,7 @@ struct igb_mac_addr { > > #define IGB_MAC_STATE_DEFAULT 0x1 > #define IGB_MAC_STATE_IN_USE 0x2 > +#define IGB_MAC_STATE_SRC_ADDR 0x4 > > /* board specific private data structure */ > struct igb_adapter { > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c > b/drivers/net/ethernet/intel/igb/igb_main.c > index 543aa99892eb..db66b697fe3b 100644 > --- a/drivers/net/ethernet/intel/igb/igb_main.c > +++ b/drivers/net/ethernet/intel/igb/igb_main.c > @@ -6837,8 +6837,13 @@ static void igb_set_default_mac_filter(struct > igb_adapter *adapter) > igb_rar_set_index(adapter, 0); > } > > +/* Add a MAC filter for 'addr' directing matching traffic to 'queue', > + * 'flags' is used to indicate what kind of match is made, match 
is by > + * default for the destination address, if matching by source address > + * is desired the flag IGB_MAC_STATE_SRC_ADDR can be used. > + */ > static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr, > - const u8 queue) > + const u8 queue, const u8 flags) > { > struct e1000_hw *hw = >hw; > int rar_entries = hw->mac.rar_entry_count - > @@ -6858,7 +6863,7 @@ static int igb_add_mac_filter(struct igb_adapter > *adapter, const u8 *addr, > > ether_addr_copy(adapter->mac_table[i].addr, addr); > adapter->mac_table[i].queue = queue; > - adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE; > + adapter->mac_table[i].state |= (IGB_MAC_STATE_IN_USE | flags); More unneeded parenthesis. > > igb_rar_set_index(adapter, i); > return i; > @@ -6867,8 +6872,14 @@ static int igb_add_mac_filter(struct igb_adapter > *adapter, const u8 *addr, > return -ENOSPC; > } > > +/* Remove a MAC filter for 'addr' directing matching traffic to > + * 'queue', 'flags' is used to indicate what kind of match need to be > + * removed, match is by default for the destination address, if > + * matching by source address is to be removed the flag > + * IGB_MAC_STATE_SRC_ADDR can be used. > + */ > static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr, > - const u8 queue) > + const u8 queue, const u8 flags) > { > struct e1000_hw *hw = >hw; > int rar_entries = hw->mac.rar_entry_count - > @@ -6883,14 +6894,15 @@ static int igb_del_mac_filter(struct igb_adapter > *adapter, const u8 *addr, > * for the VF MAC addresses. > */ > for (i = 0; i < rar_entries; i++) { > - if (!(adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE)) > + if (!(adapter->mac_table[i].state & > + (IGB_MAC_STATE_IN_USE | flags))) Shouldn't these be two separate checks? If the address isn't in use why would I care what the flags state is? It probably isn't valid. 
> continue; > if (adapter->mac_table[i].queue != queue) > continue; > if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) > continue; > > - adapter->mac_table[i].state &= ~IGB_MAC_STATE_IN_USE; > + adapter->mac_table[i].state &= ~(IGB_MAC_STATE_IN_USE | > flags); Maybe instead of just clearing the specific flags we should just clear the state and reset it back to 0. > memset(adapter->mac_table[i].addr, 0, ETH_ALEN); > adapter->mac_table[i].queue = 0; > > @@ -6906,7 +6918,8 @@ static int
Re: [Intel-wired-lan] [next-queue PATCH 3/8] igb: Enable the hardware traffic class feature bit for igb models
On Fri, Feb 23, 2018 at 5:20 PM, Vinicius Costa Gomeswrote: > This will allow functionality depending on the hardware being traffic > class aware to work. In particular the tc-flower offloading checks > verifies that this bit is set. > > Signed-off-by: Vinicius Costa Gomes > --- > drivers/net/ethernet/intel/igb/igb_main.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c > b/drivers/net/ethernet/intel/igb/igb_main.c > index 0ea32be07d71..543aa99892eb 100644 > --- a/drivers/net/ethernet/intel/igb/igb_main.c > +++ b/drivers/net/ethernet/intel/igb/igb_main.c > @@ -2820,8 +2820,10 @@ static int igb_probe(struct pci_dev *pdev, const > struct pci_device_id *ent) >NETIF_F_HW_VLAN_CTAG_TX | >NETIF_F_RXALL; > > - if (hw->mac.type >= e1000_i350) > - netdev->hw_features |= NETIF_F_NTUPLE; > + if (hw->mac.type >= e1000_i350) { > + netdev->hw_features |= (NETIF_F_NTUPLE | NETIF_F_HW_TC); > + netdev->features |= NETIF_F_HW_TC; The parens aren't needed. Also you might consider moving this block up to where we have a similar one for 82576. Then you wouldn't need to set both features and hw_features in the case of the HW_TC flag. > + } > > if (pci_using_dac) > netdev->features |= NETIF_F_HIGHDMA; > -- > 2.16.2 > > ___ > Intel-wired-lan mailing list > intel-wired-...@osuosl.org > https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
On Fri, Feb 23, 2018 at 3:59 PM, Stephen Hemmingerwrote: > On Thu, 22 Feb 2018 13:30:12 -0800 > Alexander Duyck wrote: > >> > Again, I undertand your motivation. Yet I don't like your solution. >> > But if the decision is made to do this in-driver bonding. I would like >> > to see it baing done some generic way: >> > 1) share the same "in-driver bonding core" code with netvsc >> >put to net/core. >> > 2) the "in-driver bonding core" will strictly limit the functionality, >> >like active-backup mode only, one vf, one backup, vf netdev type >> >check (so noone could enslave a tap or anything else) >> > If user would need something more, he should employ team/bond. > > Sharing would be good, but netvsc world would really like to only have > one visible network device. Other than the netdev count are there any other issues we need to be thinking about? If I am not mistaken you netvsc doesn't put any broadcast/multicast filters on the VF. If we ended up doing that in order to support the virtio based solution would that cause any issues? I just realized we had overlooked dealing with multicast in our current solution so we will probably be looking at syncing the multicast list like what occurs in netvsc, however we will need to do it for both the VF and the virtio interfaces. Thanks. - Alex
Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
On Fri, Feb 23, 2018 at 4:03 PM, Stephen Hemmingerwrote: > (pruned to reduce thread) > > On Wed, 21 Feb 2018 16:17:19 -0800 > Alexander Duyck wrote: > >> >>> FWIW two solutions that immediately come to mind is to export "backup" >> >>> as phys_port_name of the backup virtio link and/or assign a name to the >> >>> master like you are doing already. I think team uses team%d and bond >> >>> uses bond%d, soft naming of master devices seems quite natural in this >> >>> case. >> >> >> >> I figured I had overlooked something like that.. Thanks for pointing >> >> this out. Okay so I think the phys_port_name approach might resolve >> >> the original issue. If I am reading things correctly what we end up >> >> with is the master showing up as "ens1" for example and the backup >> >> showing up as "ens1nbackup". Am I understanding that right? >> >> >> >> The problem with the team/bond%d approach is that it creates a new >> >> netdevice and so it would require guest configuration changes. >> >> >> >>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio >> >>> link is quite neat. >> >> >> >> I agree. For non-"backup" virio_net devices would it be okay for us to >> >> just return -EOPNOTSUPP? I assume it would be and that way the legacy >> >> behavior could be maintained although the function still exists. >> >> >> - When the 'active' netdev is unplugged OR not present on a destination >> system after live migration, the user will see 2 virtio_net netdevs. >> >>> >> >>> That's necessary and expected, all configuration applies to the master >> >>> so master must exist. >> >> >> >> With the naming issue resolved this is the only item left outstanding. >> >> This becomes a matter of form vs function. >> >> >> >> The main complaint about the "3 netdev" solution is a bit confusing to >> >> have the 2 netdevs present if the VF isn't there. The idea is that >> >> having the extra "master" netdev there if there isn't really a bond is >> >> a bit ugly. 
>> > >> > Is it this uglier in terms of user experience rather than >> > functionality? I don't want it dynamically changed between 2-netdev >> > and 3-netdev depending on the presence of VF. That gets back to my >> > original question and suggestion earlier: why not just hide the lower >> > netdevs from udev renaming and such? Which important observability >> > benefits users may get if exposing the lower netdevs? >> > >> > Thanks, >> > -Siwei >> >> The only real advantage to a 2 netdev solution is that it looks like >> the netvsc solution, however it doesn't behave like it since there are >> some features like XDP that may not function correctly if they are >> left enabled in the virtio_net interface. >> >> As far as functionality the advantage of not hiding the lower devices >> is that they are free to be managed. The problem with pushing all of >> the configuration into the upper device is that you are limited to the >> intersection of the features of the lower devices. This can be >> limiting for some setups as some VFs support things like more queues, >> or better interrupt moderation options than others so trying to make >> everything work with one config would be ugly. >> > > > Let's not make XDP the blocker for doing the best solution > from the end user point of view. XDP is just yet another offload > thing which needs to be handled. The current backup device solution > used in netvsc doesn't handle the full range of offload options > (things like flow direction, DCB, etc); no one but the HW vendors > seems to care. XDP isn't the blocker here. As far as I am concerned we can go either way, with a 2 netdev or a 3 netdev solution. We just need to make sure we are aware of all the trade-offs, and make a decision one way or the other. This is quickly turning into a bikeshed and I would prefer us to all agree, or at least disagree and commit, on which way to go before we burn more cycles on a patch set that seems to be getting tied up in debate. 
With the 2 netdev solution we have to limit the functionality so that we don't break things when we bypass the guts of the driver to hand traffic off to the VF. That ends up meaning that we are stuck with an extra qdisc and Tx queue lock in the transmit path of the VF, and we cannot rely on any in-driver Rx functionality to work, such as in-driver XDP. However the advantage here is that this is how netvsc is already doing things. The issue with the 3 netdev solution is that you are stuck with 2 netdevs ("ens1", "ens1nbackup") when the VF is not present. It could be argued this isn't a very elegant looking solution, especially when the VF is not present. With virtio this makes more sense though, as you are still able to expose the full functionality of the lower device, so you don't have to strip or drop any of the existing net device ops if the "backup" bit is present. Ultimately I would have preferred to have the 3 netdev solution go with virtio
[PATCH] ip_tunnel: Do not use mark in skb by default
This reverts commit 5c38bd1b82e1f76f9fa96c1e61c9897cabf1ce45. skb->mark contains the mark of the encapsulated traffic, which can result in incorrect routing decisions, such as routing loops if the route chosen is via the tunnel itself. The correct method is to use tunnel->fwmark. Signed-off-by: Thomas Winter Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI --- net/ipv4/ip_tunnel.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d786a8441bce..6d21068f9b55 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -710,16 +710,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - if (tunnel->fwmark) { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, -tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, -tunnel->fwmark); - } - else { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, -tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, -skb->mark); - } + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, +tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, +tunnel->fwmark); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; -- 2.16.2
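The routing loop the commit message warns about can be sketched in plain C. This is a toy model, not kernel code: the two-entry "policy routing" table, the device ids, and all function names are invented for illustration. It only shows why keying the outer lookup on the inner skb->mark can select the tunnel device again, while keying it on the tunnel's configured fwmark cannot.

```c
#include <assert.h>

/* Invented device ids for the sketch. */
enum { DEV_ETH0 = 1, DEV_TUN0 = 2 };

/* Toy policy-routing rule: packets with mark 7 are routed via tun0,
 * everything else goes out eth0. */
int route_lookup(unsigned int mark)
{
	return mark == 7 ? DEV_TUN0 : DEV_ETH0;
}

/* Old behaviour: reuse the inner packet's mark for the outer lookup.
 * If the inner traffic carries the mark that routes via the tunnel,
 * the encapsulated packet is sent back into the tunnel: a loop. */
int xmit_dev_using_skb_mark(unsigned int skb_mark)
{
	return route_lookup(skb_mark);
}

/* Reverted/fixed behaviour: always use the tunnel's own fwmark,
 * so the inner mark cannot steer the outer packet. */
int xmit_dev_using_fwmark(unsigned int skb_mark, unsigned int tunnel_fwmark)
{
	(void)skb_mark;
	return route_lookup(tunnel_fwmark);
}
```

With a tunnel whose fwmark is 0, marked inner traffic now leaves via eth0 instead of looping back into tun0.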
Re: [virtio-dev] [RFC PATCH V2] virtio_pci: Add SR-IOV support
> On Feb 25, 2018, at 7:20 AM, Yan Vugenfirer wrote: > > Small mistake in the commit message. Red Hat (Qumranet) vendor ID is 1af4, > virtio-net device ID is 1041. > Should be: > PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe > VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe You are so right. I will fix it before I send the updated patch that provides the generic helper that Christoph suggested. Thanks for catching my mislabeling. -- Mark Rustad, mrus...@gmail.com
Re: [PATCH net] bridge: Fix VLAN reference count problem
On 25/02/18 21:59, Ido Schimmel wrote: > When a VLAN is added on a port, a reference is taken on the > corresponding master VLAN entry. If it does not already exist, then it > is created and a reference taken. > > However, in the second case a reference is not really taken when > CONFIG_REFCOUNT_FULL is enabled as refcount_inc() is replaced by > refcount_inc_not_zero(). > > Fix this by using refcount_set() on a newly created master VLAN entry. > > Fixes: 251277598596 ("net, bridge: convert net_bridge_vlan.refcnt from > atomic_t to refcount_t") > Signed-off-by: Ido Schimmel > --- > net/bridge/br_vlan.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c > index 51935270c651..9896f4975353 100644 > --- a/net/bridge/br_vlan.c > +++ b/net/bridge/br_vlan.c > @@ -168,6 +168,8 @@ static struct net_bridge_vlan *br_vlan_get_master(struct > net_bridge *br, u16 vid > masterv = br_vlan_find(vg, vid); > if (WARN_ON(!masterv)) > return NULL; > + refcount_set(&masterv->refcnt, 1); > + return masterv; > } > refcount_inc(&masterv->refcnt); > > Good catch, Acked-by: Nikolay Aleksandrov
[PATCH net] bridge: Fix VLAN reference count problem
When a VLAN is added on a port, a reference is taken on the corresponding master VLAN entry. If it does not already exist, then it is created and a reference taken. However, in the second case a reference is not really taken when CONFIG_REFCOUNT_FULL is enabled as refcount_inc() is replaced by refcount_inc_not_zero(). Fix this by using refcount_set() on a newly created master VLAN entry. Fixes: 251277598596 ("net, bridge: convert net_bridge_vlan.refcnt from atomic_t to refcount_t") Signed-off-by: Ido Schimmel --- net/bridge/br_vlan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 51935270c651..9896f4975353 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -168,6 +168,8 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; + refcount_set(&masterv->refcnt, 1); + return masterv; } refcount_inc(&masterv->refcnt); -- 2.14.3
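The bug the patch fixes can be modelled in a few lines of plain C. This is a toy model of the two increment paths, not the kernel's refcount_t API (the `toy_*` names are invented): a full-checked increment refuses to resurrect a counter that is already zero, so a freshly created entry whose counter starts at zero must be initialized with an explicit set to 1.

```c
#include <assert.h>

/* Toy stand-in for refcount_t. */
struct toy_refcount { int refs; };

/* Behaves like refcount_inc() without CONFIG_REFCOUNT_FULL:
 * a plain, unchecked increment. */
void toy_refcount_inc(struct toy_refcount *r)
{
	r->refs++;
}

/* Behaves like refcount_inc_not_zero() with CONFIG_REFCOUNT_FULL:
 * refuses to take a zero counter to one (that would be a
 * use-after-free resurrection).  Returns 1 on success, 0 otherwise. */
int toy_refcount_inc_not_zero(struct toy_refcount *r)
{
	if (r->refs == 0)
		return 0;
	r->refs++;
	return 1;
}

/* Behaves like refcount_set(): explicit initialization. */
void toy_refcount_set(struct toy_refcount *r, int n)
{
	r->refs = n;
}
```

A newly created master VLAN entry corresponds to a zero-initialized counter: the checked increment fails on it, which is why the fix calls refcount_set(..., 1) on the fresh entry instead of relying on the increment.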
Re: [for-next 7/7] IB/mlx5: Implement fragmented completion queue (CQ)
On 2/24/2018 1:40 AM, Majd Dibbiny wrote: On Feb 23, 2018, at 9:13 PM, Saeed Mahameed wrote: On Thu, 2018-02-22 at 16:04 -0800, Santosh Shilimkar wrote: Hi Saeed On 2/21/2018 12:13 PM, Saeed Mahameed wrote: [...] Jason mentioned about this patch to me off-list. We were seeing similar issue with SRQs & QPs. So wondering whether you have any plans to do similar change for other resources too so that they don't rely on higher order page allocation for icm tables. Hi Santosh, Adding Majd, Which ULP is in question? How big are the QPs/SRQs you create that lead to this problem? For icm tables we already allocate only order 0 pages: see alloc_system_page() in drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c But for kernel RDMA SRQ and QP buffers there is a place for improvement. Majd, do you know if we have any near future plans for this. It’s in our plans to move all the buffers to use 0-order pages. Santosh, Is this RDS? Do you have persistent failure with some configuration? Can you please share more information? No, the issue was seen with user verbs and actually the MLX4 driver. My last question was more for both MLX4 and MLX5 drivers' icm allocation for all the resources. With the MLX4 driver, we have seen corruption issues with MLX4_NO_RR while recycling the resources. So we ended up switching back to round robin bitmap allocation as it was before, which was changed by one of Jack's commits, 7c6d74d23 ("mlx4_core: Roll back round robin bitmap allocation commit for CQs, SRQs, and MPTs"). With default round robin, the corruption issue went away, but then its undesired effect of bloating the icm tables until you hit the resource limit means more memory fragmentation. Since these resources make use of higher order allocations, in fragmented memory scenarios we see contention on the mm lock for seconds since the compaction layer is trying to stitch pages together, which takes time. 
If these allocations don't make use of higher order pages, the issue can certainly be avoided, hence the reason behind the question. Of course we wouldn't have ended up with this issue if 'MLX4_NO_RR' worked without corruption :-) Regards, Santosh
Re: [PATCH v2] netfilter: use skb_to_full_sk in ip6_route_me_harder
On Sun, Feb 25, 2018 at 11:49:07AM -0800, Eric Dumazet wrote: > From: Eric Dumazet > > For some reason, Florian forgot to apply to ip6_route_me_harder > the fix that went in commit 29e09229d9f2 ("netfilter: use > skb_to_full_sk in ip_route_me_harder") Applied this one, thanks Eric.
[PATCH RFC net-next 17/20] net/ipv6: introduce fib6_info struct and helpers
Add fib6_info struct and alloc, destroy, hold and release helpers. Signed-off-by: David Ahern --- include/net/ip6_fib.h | 57 net/ipv6/ip6_fib.c | 60 +++ 2 files changed, 117 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d867b1696927..70978deac538 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -38,6 +38,7 @@ #endif struct rt6_info; +struct fib6_info; struct fib6_config { u32 fc_table; @@ -132,6 +133,48 @@ struct fib6_nh { int nh_weight; }; +struct fib6_info { + struct fib6_table *rt6i_table; + struct fib6_info __rcu *rt6_next; + struct fib6_node __rcu *rt6i_node; + + /* Multipath routes: +* siblings is a list of fib6_info that have the same metric/weight, +* destination, but not the same gateway. nsiblings is just a cache +* to speed up lookup. +*/ + struct list_head rt6i_siblings; + unsigned int rt6i_nsiblings; + + atomic_t rt6i_ref; + struct inet6_dev *rt6i_idev; + unsigned long expires; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] +#define fib6_hoplimit fib6_metrics->metrics[RTAX_HOPLIMIT-1] +#define fib6_metric_lock fib6_metrics->metrics[RTAX_LOCK - 1] + + struct rt6key rt6i_dst; + u32 rt6i_flags; + struct rt6key rt6i_src; + struct rt6key rt6i_prefsrc; + + struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; + + u32 rt6i_metric; + u8 rt6i_protocol; + u8 fib6_type; + u8 exception_bucket_flushed:1, + should_flush:1, + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; + + struct fib6_nh fib6_nh; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -290,6 +333,20 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); +struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct rt6_info *f6i); + +static inline void fib6_info_hold(struct rt6_info *f6i) +{ + atomic_inc(&f6i->rt6i_ref); +} + +static inline void 
fib6_info_release(struct rt6_info *f6i) +{ + if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + fib6_info_destroy(f6i); +} + static inline void rt6_hold(struct rt6_info *rt) { atomic_inc(&rt->rt6i_ref); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 63a91db61749..6553550bd09b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -145,6 +145,66 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } +struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) +{ + struct rt6_info *f6i; + + f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (!f6i) + return NULL; + + f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!f6i->rt6i_pcpu) { + kfree(f6i); + return NULL; + } + + INIT_LIST_HEAD(&f6i->rt6i_siblings); + f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; + + atomic_inc(&f6i->rt6i_ref); + + return f6i; +} + +void fib6_info_destroy(struct rt6_info *f6i) +{ + struct rt6_exception_bucket *bucket; + + WARN_ON(f6i->rt6i_node); + + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); + if (bucket) { + f6i->rt6i_exception_bucket = NULL; + kfree(bucket); + } + + if (f6i->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + } + + if (f6i->rt6i_idev) + in6_dev_put(f6i->rt6i_idev); + if (f6i->fib6_nh.nh_dev) +
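The hold/release lifetime pattern the patch introduces can be sketched in userspace C. This is a toy model with invented names (`toy_fib6_info`, and a `destroyed` flag standing in for fib6_info_destroy() freeing the entry), not the kernel code: allocation hands out the first reference, hold takes another, and the last release triggers destruction.

```c
#include <assert.h>
#include <stddef.h>

/* Toy stand-in for fib6_info: a plain int instead of atomic_t, and a
 * test hook instead of actually freeing memory. */
struct toy_fib6_info {
	int ref;
	int *destroyed;	/* set to 1 when the "entry" is destroyed */
};

/* Like fib6_info_hold(): unconditional extra reference. */
void toy_hold(struct toy_fib6_info *f6i)
{
	f6i->ref++;
}

/* Like fib6_info_release(): tolerates NULL, destroys on the last put. */
void toy_release(struct toy_fib6_info *f6i)
{
	if (f6i && --f6i->ref == 0)
		*f6i->destroyed = 1;	/* stands in for fib6_info_destroy() */
}
```

As in fib6_info_alloc(), the counter starts at 1 for the allocating caller; the entry is only torn down when every holder has dropped its reference.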
[PATCH RFC net-next 05/20] net/ipv6: Move support functions up in route.c
Code move only. Signed-off-by: David Ahern --- net/ipv6/route.c | 119 +++ 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 325a3d30fa2e..8f73335c325a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -78,7 +78,6 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -877,6 +876,65 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } #endif +/* + * Misc support functions + */ + +/* called with rcu_lock held */ +static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +{ + struct net_device *dev = rt->dst.dev; + + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { + /* for copies of local routes, dst->dev needs to be the +* device if it is a master device, the master device if +* device is enslaved, and the loopback as the default +*/ + if (netif_is_l3_slave(dev) && + !rt6_need_strict(&rt->rt6i_dst.addr)) + dev = l3mdev_master_dev_rcu(dev); + else if (!netif_is_l3_master(dev)) + dev = dev_net(dev)->loopback_dev; + /* last case is netif_is_l3_master(dev) is true in which +* case we want dev returned to be dev +*/ + } + + return dev; +} + +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +{ + BUG_ON(from->from); + + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->from = from; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} + +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + 
rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; +#ifdef CONFIG_IPV6_SUBTREES + rt->rt6i_src = ort->rt6i_src; +#endif + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); +} + static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { @@ -1016,29 +1074,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc, NULL); } -/* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) -{ - struct net_device *dev = rt->dst.dev; - - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { - /* for copies of local routes, dst->dev needs to be the -* device if it is a master device, the master device if -* device is enslaved, and the loopback as the default -*/ - if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->rt6i_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); - else if (!netif_is_l3_master(dev)) - dev = dev_net(dev)->loopback_dev; - /* last case is netif_is_l3_master(dev) is true in which -* case we want dev returned to be dev -*/ - } - - return dev; -} - static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) @@ -3121,42 +3156,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu neigh_release(neigh); } -/* - * Misc support functions - */ - -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) -{ - BUG_ON(from->from); - - rt->rt6i_flags &= ~RTF_EXPIRES; - dst_hold(&from->dst); - rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); -} - -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) -{ - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->rt6i_dst = ort->rt6i_dst; - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - 
rt->dst.lastuse = jiffies; - rt->rt6i_gateway =
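The device-selection logic in ip6_rt_get_dev_rcu() (moved by the patch above) can be reduced to three predicates. This toy sketch uses plain flags instead of net_device queries (`toy_pick_dev` and the enum are invented names); it applies only to the local/anycast case that the real function guards with RTF_LOCAL | RTF_ANYCAST.

```c
#include <assert.h>

/* What the lookup resolves to, in the toy model. */
enum { DEV_SELF, DEV_MASTER, DEV_LOOPBACK };

/* Mirrors the branch order in ip6_rt_get_dev_rcu() for local/anycast
 * copies: an enslaved (l3 slave) device resolves to its master unless
 * a strict (link-local/multicast) lookup forbids it; a plain device
 * falls back to loopback; an l3 master device stays itself. */
int toy_pick_dev(int is_l3_slave, int is_l3_master, int need_strict)
{
	if (is_l3_slave && !need_strict)
		return DEV_MASTER;
	if (!is_l3_master)
		return DEV_LOOPBACK;
	return DEV_SELF;
}
```

Note the fall-through: a slave device under a strict lookup is not a master either, so it lands on loopback, exactly as the kernel comment in the moved code describes.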
[PATCH RFC net-next 11/20] net/ipv6: Add fib6_null_entry
ip6_null_entry will stay a dst-based return for lookups that fail to match an entry. Add a new fib6_null_entry which constitutes the root node and leaves for fibs. Signed-off-by: David Ahern --- include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_fib.c | 26 ++-- net/ipv6/route.c | 62 +--- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 987cc4569cb8..c6ee8d21bfda 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -59,7 +59,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7bc23b048189..5b03f7e8d850 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -231,7 +231,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -368,7 +368,7 @@ struct fib6_dump_arg { static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.ip6_null_entry) + if (rt == arg->net->ipv6.fib6_null_entry) return; call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); } @@ -639,7 +639,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == - net->ipv6.ip6_null_entry) { + net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } @@ -1143,9 +1143,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + 
atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); rcu_assign_pointer(sfn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ @@ -1184,7 +1184,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); } else { atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(fn->leaf, rt); @@ -1223,7 +1223,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = - info->nl_net->ipv6.ip6_null_entry; + info->nl_net->ipv6.fib6_null_entry; } #endif atomic_inc(&pn_leaf->rt6i_ref); @@ -1466,7 +1466,7 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, @@ -1503,7 +1503,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { - rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } @@ -1548,7 +1548,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, #if RT6_DEBUG >= 2
[PATCH v2] netfilter: use skb_to_full_sk in ip6_route_me_harder
From: Eric Dumazet For some reason, Florian forgot to apply to ip6_route_me_harder the fix that went in commit 29e09229d9f2 ("netfilter: use skb_to_full_sk in ip_route_me_harder") Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Reported-by: syzbot --- net/ipv6/netfilter.c |9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d95ceca7ff8f648ff301d91a2e3eb60fc2050f1c..531d6957af36c4af48176f9360e9d95f78a45d55 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -21,18 +21,19 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); + struct sock *sk = sk_to_full_sk(skb->sk); unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, .flowi6_mark = skb->mark, - .flowi6_uid = sock_net_uid(net, skb->sk), + .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, }; int err; - dst = ip6_route_output(net, skb->sk, &fl6); + dst = ip6_route_output(net, sk, &fl6); err = dst->error; if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -50,7 +51,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { skb_dst_set(skb, NULL); - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst);
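Why sk_to_full_sk() matters here can be sketched with simplified structures. Since the Fixes: commit, skb->sk on a SYNACK can point at a request socket, which has no usable sk_bound_dev_if of its own; sk_to_full_sk() walks back to the full listener socket. The structures and `toy_*` names below are invented stand-ins for illustration, not the kernel's definitions.

```c
#include <assert.h>
#include <stddef.h>

/* Toy socket: either a full socket or a request (mini) socket that
 * points back at its listener. */
struct toy_sock {
	int is_full;			/* 0 for a request sock */
	int sk_bound_dev_if;		/* only meaningful on full socks */
	struct toy_sock *listener;	/* like rsk_listener on request socks */
};

/* Like sk_to_full_sk(): map a request sock to its listener, pass
 * full socks (and NULL) through unchanged. */
struct toy_sock *toy_sk_to_full_sk(struct toy_sock *sk)
{
	if (sk && !sk->is_full)
		return sk->listener;
	return sk;
}
```

Reading sk_bound_dev_if through the converted pointer yields the listener's bound device, which is what the route lookup in ip6_route_me_harder needs.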
[PATCH RFC net-next 10/20] net/ipv6: move expires into rt6_info
Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. Signed-off-by: David Ahern --- include/net/ip6_fib.h | 26 +- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ip6_fib.c| 8 net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 14 +++--- 5 files changed, 36 insertions(+), 20 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index da81669b9c90..3ba0bb7c7a43 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] #define fib6_hoplimit fib6_metrics->metrics[RTAX_HOPLIMIT-1] @@ -199,6 +200,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -213,11 +234,6 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; - - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eeecef2b83a4..478f45bf13cf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1202,7 +1202,7 @@ cleanup_prefix_route(struct 
inet6_ifaddr *ifp, unsigned long expires, bool del_r ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_set_expires(rt, expires); + fib6_set_expires(rt, expires); ip6_rt_put(rt); } } @@ -2648,9 +2648,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt6_set_expires(rt, jiffies + rt_expires); + fib6_set_expires(rt, jiffies + rt_expires); } else { - rt6_clean_expires(rt); + fib6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index faa2b46349df..7bc23b048189 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -886,9 +886,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); + fib6_clean_expires(iter); else - rt6_set_expires(iter, rt->dst.expires); + fib6_set_expires(iter, rt->expires); iter->fib6_pmtu = rt->fib6_pmtu; return -EEXIST; } @@ -1975,8 +1975,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) * Routes are expired even if they are in use. */ - if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { - if (time_after(now, rt->dst.expires)) { + if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) { + if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 33f9e3dc526a..bd804e8cd73c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1318,7
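The three expiry helpers the patch adds form a small, self-contained state machine, which is easy to model in userspace. In this sketch a plain counter stands in for jiffies and a bit flag for RTF_EXPIRES; the `toy_*` names and structure are invented for illustration only.

```c
#include <assert.h>

#define TOY_RTF_EXPIRES 0x1	/* stands in for RTF_EXPIRES */

struct toy_fib6_info {
	unsigned int flags;
	unsigned long expires;
};

/* Like fib6_set_expires(): arm the deadline and mark the entry. */
void toy_fib6_set_expires(struct toy_fib6_info *f6i, unsigned long expires)
{
	f6i->expires = expires;
	f6i->flags |= TOY_RTF_EXPIRES;
}

/* Like fib6_clean_expires(): disarm and clear the deadline. */
void toy_fib6_clean_expires(struct toy_fib6_info *f6i)
{
	f6i->flags &= ~TOY_RTF_EXPIRES;
	f6i->expires = 0;
}

/* Like fib6_check_expired(): only an armed entry can be expired;
 * "now > expires" stands in for time_after(jiffies, expires). */
int toy_fib6_check_expired(const struct toy_fib6_info *f6i, unsigned long now)
{
	if (f6i->flags & TOY_RTF_EXPIRES)
		return now > f6i->expires;
	return 0;
}
```

An entry without the flag never reports expired, no matter what its expires field holds, which is why both set and clean touch the flag and the timestamp together.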
[PATCH RFC net-next 16/20] net/ipv6: Cleanup exception route handling
IPv6 FIB will only contain FIB entries with exception routes added to the FIB entry. Remove CACHE and dst checks from fib6 add and delete since they can never happen once the data type changes. Fix up the lookup functions to use a f6i name for fib lookups and retain the current rt name for return variables. Signed-off-by: David Ahern --- net/ipv6/ip6_fib.c | 16 +-- net/ipv6/route.c | 122 ++--- 2 files changed, 71 insertions(+), 67 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5b03f7e8d850..63a91db61749 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1046,7 +1046,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + (rt->rt6i_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -1097,8 +1097,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; - if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1622,8 +1620,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, RT6_TRACE("fib6_del_route\n"); - WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); - /* Unlink it */ *rtp = rt->rt6_next; rt->rt6i_node = NULL; @@ -1692,21 +1688,11 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) struct rt6_info __rcu **rtp; struct rt6_info __rcu **rtp_next; -#if RT6_DEBUG >= 2 - if (rt->dst.obsolete > 0) { - WARN_ON(fn); - return -ENOENT; - } -#endif if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - /* remove cached dst from exception table */ - if (rt->rt6i_flags & RTF_CACHE) - return rt6_remove_exception_rt(rt); - /* * Walk the leaf entries looking for ourself */ diff --git a/net/ipv6/route.c 
b/net/ipv6/route.c index 3ea60e932eb9..19b91c60ee55 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1094,35 +1094,36 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; struct fib6_node *fn; + struct rt6_info *rt; rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = rcu_dereference(fn->leaf); - if (!rt) { - rt = net->ipv6.fib6_null_entry; + f6i = rcu_dereference(fn->leaf); + if (!f6i) { + f6i = net->ipv6.fib6_null_entry; } else { - rt = rt6_device_match(net, rt, &fl6->saddr, + f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, + if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + f6i = rt6_multipath_select(f6i, fl6, fl6->flowi6_oif, flags); } - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } + /* Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) { - rt = rt_cache; + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); } else { - rt = ip6_create_rt_rcu(rt); + rt = ip6_create_rt_rcu(f6i); } rcu_read_unlock(); @@ -1204,9 +1205,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); rt = __ip6_dst_alloc(dev_net(dev), dev, 0); @@ -1432,11 +1430,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_exception *rt6_ex; int err = 0; - /* ort can't be a cache or pcpu route */ - if (ort->rt6i_flags & (RTF_CACHE |
[PATCH RFC net-next 08/20] net/ipv6: Defer initialization of dst to data path
Defer setting dst input, output and error until fib entry is copied. Signed-off-by: David Ahern --- net/ipv6/route.c | 115 +++ 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ff809ee930c7..b56f56508970 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -918,6 +918,75 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) return dev; } +static const int fib6_prop[RTN_MAX + 1] = { + [RTN_UNSPEC]= 0, + [RTN_UNICAST] = 0, + [RTN_LOCAL] = 0, + [RTN_BROADCAST] = 0, + [RTN_ANYCAST] = 0, + [RTN_MULTICAST] = 0, + [RTN_BLACKHOLE] = -EINVAL, + [RTN_UNREACHABLE] = -EHOSTUNREACH, + [RTN_PROHIBIT] = -EACCES, + [RTN_THROW] = -EAGAIN, + [RTN_NAT] = -EINVAL, + [RTN_XRESOLVE] = -EINVAL, +}; + +static int ip6_rt_type_to_error(u8 fib6_type) +{ + return fib6_prop[fib6_type]; +} + +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + + switch (ort->fib6_type) { + case RTN_BLACKHOLE: + rt->dst.output = dst_discard_out; + rt->dst.input = dst_discard; + break; + case RTN_PROHIBIT: + rt->dst.output = ip6_pkt_prohibit_out; + rt->dst.input = ip6_pkt_prohibit; + break; + case RTN_THROW: + case RTN_UNREACHABLE: + default: + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + break; + } +} + +static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) +{ + if (ort->rt6i_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, ort); + return; + } + + rt->dst.error = 0; + rt->dst.output = ip6_output; + + if (ort->fib6_type == RTN_LOCAL) { + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { + rt->dst.input = ip6_mc_input; + } else { + rt->dst.input = ip6_forward; + } + + if (ort->fib6_nh.nh_lwtstate) { + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); + lwtunnel_set_redirect(&rt->dst); + } + + rt->dst.lastuse = jiffies; +} + static 
void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { BUG_ON(from->from); @@ -930,14 +999,12 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; + ip6_rt_init_dst(rt, ort); + rt->rt6i_dst = ort->rt6i_dst; - rt->dst.error = ort->dst.error; rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); @@ -2210,7 +2277,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, continue; if (rt6_check_expired(rt)) continue; - if (rt->dst.error) + if (rt->rt6i_flags & RTF_REJECT) break; if (!(rt->rt6i_flags & RTF_GATEWAY)) continue; @@ -2238,7 +2305,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, if (!rt) rt = net->ipv6.ip6_null_entry; - else if (rt->dst.error) { + else if (rt->rt6i_flags & RTF_REJECT) { rt = net->ipv6.ip6_null_entry; goto out; } @@ -2707,15 +2774,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, addr_type = ipv6_addr_type(&cfg->fc_dst); - if (addr_type & IPV6_ADDR_MULTICAST) - rt->dst.input = ip6_mc_input; - else if (cfg->fc_flags & RTF_LOCAL) - rt->dst.input = ip6_input; - else - rt->dst.input = ip6_forward; - - rt->dst.output = ip6_output; - if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; @@ -2725,7 +2783,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); - lwtunnel_set_redirect(&rt->dst); } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); @@ -2765,27 +2822,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, } } rt->rt6i_flags
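The core idea of the patch above is that a reject route's error code is no longer stored in dst.error but derived on demand from fib6_type via the fib6_prop[] table. A userspace sketch of that mapping (with invented `TOY_*` constants replacing the RTN_* values, and the error values mirroring the table in the diff):

```c
#include <assert.h>
#include <errno.h>

/* Invented stand-ins for the RTN_* route types used by the table. */
enum {
	TOY_RTN_UNICAST = 1,
	TOY_RTN_BLACKHOLE = 6,
	TOY_RTN_UNREACHABLE = 7,
	TOY_RTN_PROHIBIT = 8,
	TOY_RTN_THROW = 9,
};

/* Mirrors ip6_rt_type_to_error()/fib6_prop[]: derive the dst error
 * from the route type instead of storing it on every dst copy. */
int toy_type_to_error(int fib6_type)
{
	switch (fib6_type) {
	case TOY_RTN_BLACKHOLE:   return -EINVAL;
	case TOY_RTN_UNREACHABLE: return -EHOSTUNREACH;
	case TOY_RTN_PROHIBIT:    return -EACCES;
	case TOY_RTN_THROW:       return -EAGAIN;
	default:                  return 0;	/* unicast, local, ... */
	}
}
```

Because the error is a pure function of the type, the reject check in the data path can test the RTF_REJECT flag (as the diff changes __ip6_route_redirect to do) and compute the error only when a dst is actually built.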
[PATCH RFC net-next 15/20] net/ipv6: Add gfp_flags to route add functions
Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Signed-off-by: David Ahern--- include/net/ip6_route.h | 6 -- net/ipv6/addrconf.c | 39 +++ net/ipv6/anycast.c | 2 +- net/ipv6/route.c| 18 ++ 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index aa59c6eb34e6..24c78fb6ac36 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -98,7 +98,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -134,7 +135,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6ac54498ae71..2a032b932922 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1049,7 +1049,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2295,7 +2295,7 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad 
static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, @@ -2320,7 +2320,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(, NULL); + ip6_route_add(, gfp_flags, NULL); } @@ -2376,7 +2376,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(_dst, htonl(0xFF00), 0, 0, 0); - ip6_route_add(, NULL); + ip6_route_add(, GFP_ATOMIC, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -2661,7 +2661,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) expires = jiffies_to_clock_t(rt_expires); } addrconf_prefix_route(>prefix, pinfo->prefix_len, - dev, expires, flags); + dev, expires, flags, GFP_ATOMIC); } ip6_rt_put(rt); } @@ -2876,7 +2876,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(>addr, ifp->prefix_len, dev, - expires, flags); + expires, flags, GFP_KERNEL); } /* @@ -3024,7 +3024,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, , plen, scope); - addrconf_prefix_route(, plen, idev->dev, 0, pflags); + addrconf_prefix_route(, plen, idev->dev, 0, pflags, + GFP_ATOMIC); return; } @@ -3049,7 +3050,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) add_addr(idev, , plen, flag); addrconf_prefix_route(, plen, idev->dev, 0, - pflags); + pflags, GFP_ATOMIC); } } } @@ -3089,7 +3090,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, ifp = ipv6_add_addr(idev,
[PATCH RFC net-next 12/20] net/ipv6: Add rt6_info create function for ip6_pol_route_lookup
ip6_pol_route_lookup is the lookup function for ip6_route_lookup and
rt6_lookup. At the moment it returns either a reference to a FIB entry
or a cached exception. To move FIB entries to a separate struct, this
lookup function needs to convert FIB entries to an rt6_info that is
returned to the caller.

Signed-off-by: David Ahern
---
 net/ipv6/route.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 31832b170a9f..066203d2f9d3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1053,6 +1053,20 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 	return false;
 }
 
+/* called with rcu_lock held */
+static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
+{
+	struct net_device *dev;
+	struct rt6_info *nrt;
+
+	dev = ip6_rt_get_dev_rcu(rt);
+	nrt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+	if (nrt)
+		ip6_rt_copy_init(nrt, rt);
+
+	return nrt;
+}
+
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
 					     struct flowi6 *fl6, int flags)
@@ -1080,18 +1094,19 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 	}
 	/* Search through exception table */
 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
-	if (rt_cache)
+	if (rt_cache) {
 		rt = rt_cache;
-
-	if (ip6_hold_safe(net, &rt, true))
-		dst_use_noref(&rt->dst, jiffies);
+		if (ip6_hold_safe(net, &rt, true))
+			dst_use_noref(&rt->dst, jiffies);
+	} else {
+		rt = ip6_create_rt_rcu(rt);
+	}
 
 	rcu_read_unlock();
 
 	trace_fib6_table_lookup(net, rt, table, fl6);
 
 	return rt;
-
 }
 
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
-- 
2.11.0
[PATCH RFC net-next 18/20] net/ipv6: separate handling of FIB entries from dst based routes
Signed-off-by: David Ahern--- include/net/ip6_fib.h | 4 +- include/net/ip6_route.h | 3 +- net/ipv6/addrconf.c | 31 ++--- net/ipv6/anycast.c | 7 +- net/ipv6/ip6_fib.c | 50 +-- net/ipv6/ip6_output.c | 3 +- net/ipv6/ndisc.c| 6 +- net/ipv6/route.c| 167 +--- 8 files changed, 121 insertions(+), 150 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 70978deac538..ff16e3d571a2 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -315,9 +315,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(>rt6i_uncached)) && rt->from)) - rt = rt->from; - - rt6_get_cookie_safe(rt, ); + rt6_get_cookie_safe(rt->from, ); return cookie; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 24c78fb6ac36..fcda09a58193 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -113,8 +113,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; + struct inet6_dev *idev = rt ? 
rt->rt6i_idev : NULL; int err = 0; if (rt && rt->rt6i_prefsrc.plen) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2a032b932922..4dd7b4e9de4c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -927,7 +927,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - ip6_rt_put(ifp->rt); + fib6_info_release(ifp->rt); kfree_rcu(ifp, rcu); } @@ -1080,6 +1080,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->cstamp = ifa->tstamp = jiffies; ifa->tokenized = false; + fib6_info_hold(rt); ifa->rt = rt; ifa->idev = idev; @@ -1114,8 +1115,12 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - if (rt) - ip6_rt_put(rt); + /* one release for the hold taken when rt is set in ifa +* and a second release for the hold taken on rt create +*/ + fib6_info_release(rt); + fib6_info_release(rt); + if (ifa) { if (ifa->idev) in6_dev_put(ifa->idev); @@ -1203,7 +1208,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r else { if (!(rt->rt6i_flags & RTF_EXPIRES)) fib6_set_expires(rt, expires); - ip6_rt_put(rt); + fib6_info_release(rt); } } } @@ -2350,8 +2355,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - if (!dst_hold_safe(>dst)) - rt = NULL; + fib6_info_hold(rt); break; } out: @@ -2663,7 +2667,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) addrconf_prefix_route(>prefix, pinfo->prefix_len, dev, expires, flags, GFP_ATOMIC); } - ip6_rt_put(rt); + fib6_info_release(rt); } /* Try to figure out our local address for this prefix */ @@ -3330,9 +3334,14 @@ static int fixup_permanent_addr(struct net *net, spin_lock(>lock); prev = ifp->rt; ifp->rt = rt; + fib6_info_hold(rt); spin_unlock(>lock); - ip6_rt_put(prev); + /* one release for 
the hold taken when rt is set in ifa +* and a second release for the hold taken on rt create +*/ + fib6_info_release(prev); + fib6_info_release(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { @@ -3706,6 +3715,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) rt = ifa->rt; ifa->rt = NULL; + fib6_info_release(rt); } else { state = ifa->state; ifa->state = INET6_IFADDR_STATE_DEAD; @@ -5600,8 +5610,9 @@ static void __ipv6_ifa_notify(int event, struct
[PATCH RFC net-next 03/20] net/ipv6: Pass net to fib6_update_sernum
Pass the net namespace to fib6_update_sernum. It cannot be marked const
as fib6_new_sernum will change ipv6.fib6_sernum.

Signed-off-by: David Ahern
---
 include/net/ip6_fib.h |  2 +-
 net/ipv6/ip6_fib.c    |  3 +--
 net/ipv6/route.c      | 10 +++++-----
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 34ec321d6a03..363d4b9d140c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -406,7 +406,7 @@ void __net_exit fib6_notifier_exit(struct net *net);
 unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
-void fib6_update_sernum(struct rt6_info *rt);
+void fib6_update_sernum(struct net *net, struct rt6_info *rt);
 void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cab95cf3b39f..51d2d5a7ba89 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -105,9 +105,8 @@ enum {
 	FIB6_NO_SERNUM_CHANGE = 0,
 };
 
-void fib6_update_sernum(struct rt6_info *rt)
+void fib6_update_sernum(struct net *net, struct rt6_info *rt)
 {
-	struct net *net = dev_net(rt->dst.dev);
 	struct fib6_node *fn;
 
 	fn = rcu_dereference_protected(rt->rt6i_node,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa709b644945..153577ded1a7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1344,7 +1344,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	/* Update fn->fn_sernum to invalidate all cached dst */
 	if (!err) {
 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
-		fib6_update_sernum(ort);
+		fib6_update_sernum(net, ort);
 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
 		fib6_force_start_gc(net);
 	}
@@ -3636,11 +3636,11 @@ void rt6_multipath_rebalance(struct rt6_info *rt)
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
-	const struct net *net = dev_net(arg->dev);
+	struct net *net = dev_net(arg->dev);
 
 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
 		rt->rt6i_nh_flags &= ~arg->nh_flags;
-		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
 
@@ -3719,7 +3719,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
 	const struct net_device *dev = arg->dev;
-	const struct net *net = dev_net(dev);
+	struct net *net = dev_net(dev);
 
 	if (rt == net->ipv6.ip6_null_entry)
 		return 0;
@@ -3742,7 +3742,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 	}
 
 		rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
-		fib6_update_sernum(rt);
+		fib6_update_sernum(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
 	return -2;
-- 
2.11.0
[PATCH RFC net-next 13/20] net/ipv6: Move dst flags to booleans in fib entries
Use bool to hold requests for certain dst settings when a FIB entry is converted to a dst. Signed-off-by: David Ahern--- include/net/ip6_fib.h | 5 - net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 29 - 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3ba0bb7c7a43..d867b1696927 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 478f45bf13cf..6ac54498ae71 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1058,7 +1058,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst.flags |= DST_NOPOLICY; + rt->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -5945,7 +5945,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) int cpu; rcu_read_lock(); - addrconf_set_nopolicy(ifa->rt, val); + ifa->rt->dst_nopolicy = val ? 
true : false; if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 066203d2f9d3..61f4f0333c73 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -935,6 +935,20 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } +static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +{ + unsigned short flags = 0; + + if (rt->dst_nocount) + flags |= DST_NOCOUNT; + if (rt->dst_nopolicy) + flags |= DST_NOPOLICY; + if (rt->dst_host) + flags |= DST_HOST; + + return flags; +} + static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -959,6 +973,8 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) { + rt->dst.flags |= fib6_info_dst_flags(ort); + if (ort->rt6i_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -968,7 +984,6 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.output = ip6_output; if (ort->fib6_type == RTN_LOCAL) { - rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; } else if (ipv6_addr_type(>rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; @@ -1056,11 +1071,12 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct rt6_info *nrt; dev = ip6_rt_get_dev_rcu(rt); - nrt = __ip6_dst_alloc(dev_net(dev), dev, 0); + nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1215,12 +1231,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct 
rt6_info *pcpu_rt; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -2768,7 +2785,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(>rt6i_dst.addr, >fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(>rt6i_src.addr, >fc_src, cfg->fc_src_len); @@ -3474,10 +3491,12 @@ struct rt6_info
[PATCH RFC net-next 19/20] net/ipv6: Flip FIB entries to fib6_info
Signed-off-by: David Ahern--- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 64 ++--- include/net/if_inet6.h | 4 +- include/net/ip6_fib.h | 38 +-- include/net/ip6_route.h| 28 +-- include/net/netns/ipv6.h | 2 +- net/ipv6/addrconf.c| 20 +- net/ipv6/anycast.c | 4 +- net/ipv6/ip6_fib.c | 114 - net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 259 ++--- 10 files changed, 267 insertions(+), 268 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 90d01df783b3..f74f65a2161b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -441,7 +441,7 @@ struct mlxsw_sp_fib6_entry { struct mlxsw_sp_rt6 { struct list_head list; - struct rt6_info *rt; + struct fib6_info *rt; }; struct mlxsw_sp_lpm_tree { @@ -3764,7 +3764,7 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, for (i = 0; i < nh_grp->count; i++) { struct mlxsw_sp_nexthop *nh = _grp->nexthops[i]; - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) >gw_addr, @@ -3850,7 +3850,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); list_for_each_entry(mlxsw_sp_rt6, _entry->rt6_list, list) { - struct rt6_info *rt = mlxsw_sp_rt6->rt; + struct fib6_info *rt = mlxsw_sp_rt6->rt; rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } @@ -4629,7 +4629,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); } -static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt) { /* Packets with link-local destination IP arriving to the router * are trapped to the CPU, so no need to program specific routes @@ -4651,7 +4651,7 @@ 
static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) return false; } -static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) +static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt) { struct mlxsw_sp_rt6 *mlxsw_sp_rt6; @@ -4664,18 +4664,18 @@ static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) * memory. */ mlxsw_sp_rt6->rt = rt; - rt6_hold(rt); + fib6_info_hold(rt); return mlxsw_sp_rt6; } #if IS_ENABLED(CONFIG_IPV6) -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { - rt6_release(rt); + fib6_info_release(rt); } #else -static void mlxsw_sp_rt6_release(struct rt6_info *rt) +static void mlxsw_sp_rt6_release(struct fib6_info *rt) { } #endif @@ -4686,13 +4686,13 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } -static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt) +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; } -static struct rt6_info * +static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { return list_first_entry(_entry->rt6_list, struct mlxsw_sp_rt6, @@ -4701,7 +4701,7 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, -const struct rt6_info *nrt, bool replace) +const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; @@ -4709,7 +4709,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, return NULL; list_for_each_entry(fib6_entry, _node->entry_list, common.list) { - struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same * 
virtual router. @@ -4732,7 +4732,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct
[PATCH RFC net-next 14/20] net/ipv6: Create a neigh_lookup for FIB entries
Refactor dst_neigh_lookup and create a new function that takes the gateway and device. Since rt6_get_dflt_router returns a FIB entry, change ndisc_router_discovery to use the new ip6_neigh_lookup. Signed-off-by: David Ahern--- include/net/ip6_route.h | 5 + net/ipv6/ndisc.c| 8 ++-- net/ipv6/route.c| 33 - 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index bd82ec4a57e6..aa59c6eb34e6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -268,4 +268,9 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) ipv6_addr_equal(>fib6_nh.nh_gw, >fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } + +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr); #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index bd804e8cd73c..023da106b682 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1276,7 +1276,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, _hdr(skb)->saddr, skb->dev); if (rt) { - neigh = dst_neigh_lookup(>dst, _hdr(skb)->saddr); + neigh = ip6_neigh_lookup(>fib6_nh.nh_gw, +rt->fib6_nh.nh_dev, NULL, + _hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", @@ -1304,7 +1306,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_neigh_lookup(>dst, _hdr(skb)->saddr); + neigh = ip6_neigh_lookup(>fib6_nh.nh_gw, +rt->fib6_nh.nh_dev, NULL, + _hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 61f4f0333c73..ad7a80ad7b59 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -182,12 +182,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static inline const void *choose_neigh_daddr(struct 
rt6_info *rt, +static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { - struct in6_addr *p = >rt6i_gateway; - if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) @@ -195,18 +193,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt, return daddr; } -static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr) { - struct rt6_info *rt = (struct rt6_info *) dst; struct neighbour *n; - daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(dst->dev, daddr); + daddr = choose_neigh_daddr(gw, skb, daddr); + n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(_tbl, daddr, dst->dev); + return neigh_create(_tbl, daddr, dev); +} + +static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + + return ip6_neigh_lookup(>rt6i_gateway, dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) @@ -214,7 +221,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(rt, NULL, daddr); + daddr = choose_neigh_daddr(>rt6i_gateway, NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -239,7 +246,7 @@ static struct dst_ops ip6_dst_ops_template = { .update_pmtu= ip6_rt_update_pmtu, .redirect
[PATCH RFC net-next 07/20] net/ipv6: Move nexthop data to fib6_nh
Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern--- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 32 ++-- include/net/ip6_fib.h | 16 +- include/net/ip6_route.h| 6 +- net/ipv6/addrconf.c| 2 +- net/ipv6/ip6_fib.c | 6 +- net/ipv6/route.c | 164 - 6 files changed, 127 insertions(+), 99 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 05146970c19c..90d01df783b3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2700,9 +2700,9 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp, struct in6_addr *gw; int ifindex, weight; - ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex; - weight = mlxsw_sp_rt6->rt->rt6i_nh_weight; - gw = _sp_rt6->rt->rt6i_gateway; + ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex; + weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight; + gw = _sp_rt6->rt->fib6_nh.nh_gw; if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex, weight)) return false; @@ -2768,7 +2768,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) struct net_device *dev; list_for_each_entry(mlxsw_sp_rt6, _entry->rt6_list, list) { - dev = mlxsw_sp_rt6->rt->dst.dev; + dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev; val ^= dev->ifindex; } @@ -3766,9 +3766,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh = _grp->nexthops[i]; struct rt6_info *rt = mlxsw_sp_rt6->rt; - if (nh->rif && nh->rif->dev == rt->dst.dev && + if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev && ipv6_addr_equal((const struct in6_addr *) >gw_addr, - >rt6i_gateway)) + >fib6_nh.nh_gw)) return nh; continue; } @@ -3825,7 +3825,7 @@ 
mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { list_first_entry(_entry->rt6_list, struct mlxsw_sp_rt6, -list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; +list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; return; } @@ -3835,9 +3835,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); if (nh && nh->offloaded) - mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD; else - mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -3852,7 +3852,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, _entry->rt6_list, list) { struct rt6_info *rt = mlxsw_sp_rt6->rt; - rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -4748,8 +4748,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, const struct rt6_info *rt, enum mlxsw_sp_ipip_type *ret) { - return rt->dst.dev && - mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret); + return rt->fib6_nh.nh_dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret); } static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, @@ -4759,7 +4759,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; struct mlxsw_sp_rif *rif; int err; @@ -4802,11 +4802,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop
[PATCH RFC net-next 20/20] net/ipv6: Remove unused code and variables for rt6_info
Signed-off-by: David Ahern--- include/net/ip6_fib.h | 58 + net/ipv6/ip6_fib.c | 22 --- net/ipv6/route.c| 27 ++- net/ipv6/xfrm6_policy.c | 2 -- 4 files changed, 3 insertions(+), 106 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7dd8b0cf55ed..723cb8f2a6d9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,31 +177,10 @@ struct fib6_info { struct rt6_info { struct dst_entrydst; - struct rt6_info __rcu *rt6_next; struct fib6_info*from; - /* -* Tail elements of dst_entry (__refcnt etc.) -* and these elements (rarely used in hot path) are in -* the same cache line. -*/ - struct fib6_table *rt6i_table; - struct fib6_node __rcu *rt6i_node; - struct in6_addr rt6i_gateway; - - /* Multipath routes: -* siblings is a list of rt6_info that have the the same metric/weight, -* destination, but not the same gateway. nsiblings is just a cache -* to speed up lookup. -*/ - struct list_headrt6i_siblings; - unsigned intrt6i_nsiblings; - - atomic_trt6i_ref; - - /* These are in a separate cache line. 
*/ - struct rt6key rt6i_dst cacheline_aligned_in_smp; + struct rt6key rt6i_dst; u32 rt6i_flags; struct rt6key rt6i_src; struct rt6key rt6i_prefsrc; @@ -210,27 +189,8 @@ struct rt6_info { struct uncached_list*rt6i_uncached_list; struct inet6_dev*rt6i_idev; - struct rt6_info * __percpu *rt6i_pcpu; - struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - - u32 rt6i_metric; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; - u8 rt6i_protocol; - u8 fib6_type; - u8 exception_bucket_flushed:1, - should_flush:1, - dst_nocount:1, - dst_nopolicy:1, - dst_host:1, - unused:3; - - unsigned long expires; - struct dst_metrics *fib6_metrics; -#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] -#define fib6_hoplimit fib6_metrics->metrics[RTAX_HOPLIMIT-1] -#define fib6_metric_lock fib6_metrics->metrics[RTAX_LOCK - 1] - struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ @@ -329,8 +289,6 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(>dst); } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); - struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); void fib6_info_destroy(struct fib6_info *f6i); @@ -345,20 +303,6 @@ static inline void fib6_info_release(struct fib6_info *f6i) fib6_info_destroy(f6i); } -static inline void rt6_hold(struct rt6_info *rt) -{ - atomic_inc(>rt6i_ref); -} - -static inline void rt6_release(struct rt6_info *rt) -{ - if (atomic_dec_and_test(>rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(>dst); - dst_release(>dst); - } -} - enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d697e408bbf3..b6750a90c355 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -235,28 +235,6 @@ static void node_free(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) -{ - int cpu; - - if (!non_pcpu_rt->rt6i_pcpu) - return; - - for_each_possible_cpu(cpu) { - struct rt6_info 
**ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(_rt->dst); - dst_release(_rt->dst); - *ppcpu_rt = NULL; - } - } -} -EXPORT_SYMBOL_GPL(rt6_free_pcpu); - static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(>tb6_peers); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fab34c68f498..041701105a4a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -302,10 +302,6 @@ static const struct rt6_info
[PATCH RFC net-next 06/20] net/ipv6: Save route type in rt6_info flags
Signed-off-by: David Ahern--- include/net/ip6_fib.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 46 -- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 363d4b9d140c..4badcf105bd9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 120affb4ea74..82862d3f0295 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2306,6 +2306,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_flags = RTF_UP | flags, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, + .fc_type = RTN_UNICAST, }; cfg.fc_dst = *pfx; @@ -2369,6 +2370,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, + .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8f73335c325a..2de9af0bead1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -307,6 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric= ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -324,6 +325,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric= ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_PROHIBIT, }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -339,6 +341,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric= ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_BLACKHOLE, }; #endif @@ -2609,6 +2612,11 @@ static struct rt6_info *ip6_route_info_create(struct 
fib6_config *cfg, goto out; } + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -2721,6 +2729,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_metric = cfg->fc_metric; rt->rt6i_nh_weight = 1; + rt->fib6_type = cfg->fc_type; + /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes */ @@ -3205,6 +3215,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3261,6 +3272,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3336,6 +3348,7 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_dst_len = rtmsg->rtmsg_dst_len; cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; + cfg->fc_type = rtmsg->rtmsg_type; cfg->fc_nlinfo.nl_net = net; @@ -3456,10 +3469,13 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (anycast) + if (anycast) { + rt->fib6_type = RTN_ANYCAST; rt->rt6i_flags |= RTF_ANYCAST; - else + } else { + rt->fib6_type = RTN_LOCAL; rt->rt6i_flags |= RTF_LOCAL; + } rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; @@ -4370,30 +4386,8 @@
[PATCH RFC net-next 04/20] net/ipv6: Pass net namespace to route functions
Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern--- include/net/ip6_route.h | 12 ++- net/ipv6/addrconf.c | 33 -- net/ipv6/anycast.c | 10 + net/ipv6/ndisc.c| 12 ++- net/ipv6/route.c| 54 + 5 files changed, 66 insertions(+), 55 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 27d23a65f3cd..ef0d8977e2d0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -99,8 +99,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -133,7 +133,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -143,9 +143,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, +const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, +const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4facfe0b1888..120affb4ea74 100644 --- a/net/ipv6/addrconf.c 
+++ b/net/ipv6/addrconf.c @@ -1049,7 +1049,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -1199,7 +1199,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r 0, RTF_GATEWAY | RTF_DEFAULT); if (rt) { if (del_rt) - ip6_del_rt(rt); + ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) rt6_set_expires(rt, expires); @@ -2642,7 +2642,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3304,7 +3304,8 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -static int fixup_permanent_addr(struct inet6_dev *idev, +static int fixup_permanent_addr(struct net *net, + struct inet6_dev *idev, struct inet6_ifaddr *ifp) { /* !rt6i_node means the host route was removed from the @@ -3314,7 +3315,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(idev, >addr, false); + rt = addrconf_dst_alloc(net, idev, >addr, false); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3338,7 +3339,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, return 0; } -static void addrconf_permanent_addr(struct net_device *dev) +static void addrconf_permanent_addr(struct net *net, struct net_device *dev) { struct inet6_ifaddr *ifp, *tmp; struct inet6_dev *idev; @@ -3351,7 +3352,7 @@ static void addrconf_permanent_addr(struct net_device *dev) list_for_each_entry_safe(ifp, tmp, >addr_list, if_list) { if ((ifp->flags & IFA_F_PERMANENT) && -
Re: [PATCH] netfilter: use skb_to_full_sk in ip6_route_me_harder
On Sun, 2018-02-25 at 11:47 -0800, Eric Dumazet wrote:
> No idea what happened, but it looks like this garbage should not affect
> patchwork.
>
> Tell me if a resend is needed, thanks.

Hmm... I will send a V2, sorry for this mess.
[PATCH RFC net-next 09/20] net/ipv6: move metrics from dst to rt6_info
Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. Signed-off-by: David Ahern--- include/net/ip6_fib.h | 13 +-- net/core/dst.c| 1 + net/ipv6/ip6_fib.c| 49 + net/ipv6/ndisc.c | 5 +- net/ipv6/route.c | 267 +++--- 5 files changed, 115 insertions(+), 220 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7678ae3de44a..da81669b9c90 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -94,11 +94,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -176,7 +171,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -185,6 +179,10 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] +#define fib6_hoplimit fib6_metrics->metrics[RTAX_HOPLIMIT-1] +#define fib6_metric_lock fib6_metrics->metrics[RTAX_LOCK - 1] struct fib6_nh fib6_nh; }; @@ -388,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, -struct nl_info *info, struct mx6_config *mxc, -struct netlink_ext_ack *extack); +struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, diff --git a/net/core/dst.c b/net/core/dst.c index 5f70bc832bec..2c23eadd6f7e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -59,6 +59,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; 
+EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 134d86483f3c..faa2b46349df 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -800,38 +800,6 @@ static struct fib6_node *fib6_add_1(struct net *net, return ln; } -static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) -{ - int i; - - for (i = 0; i < RTAX_MAX; i++) { - if (test_bit(i, mxc->mx_valid)) - mp[i] = mxc->mx[i]; - } -} - -static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) -{ - if (!mxc->mx) - return 0; - - if (dst->flags & DST_HOST) { - u32 *mp = dst_metrics_write_ptr(dst); - - if (unlikely(!mp)) - return -ENOMEM; - - fib6_copy_metrics(mp, mxc); - } else { - dst_init_metrics(dst, mxc->mx, false); - - /* We've stolen mx now. */ - mxc->mx = NULL; - } - - return 0; -} - static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { @@ -865,7 +833,7 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, + struct nl_info *info, struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, @@ -880,7 +848,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; - int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; @@ -922,7 +889,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); - iter->rt6i_pmtu = rt->rt6i_pmtu; + iter->fib6_pmtu = rt->fib6_pmtu;
[PATCH RFC net-next 00/20] net/ipv6: Separate data structures for FIB and data path
IPv6 uses the same data struct for both the control plane (FIB entries) and the
data path (dst entries). This struct has elements needed for both paths, adding
memory overhead and complexity (taking a dst hold in most places, but an
additional reference on rt6i_ref in a few). Furthermore, because of the
dst_alloc tie, all FIB entries are allocated with GFP_ATOMIC.

This patch set separates FIB entries from dst entries, better aligning the IPv6
code with IPv4, simplifying the reference counting and allowing FIB entries
added by userspace (not autoconf) to use GFP_KERNEL. It is the first step in a
number of performance and scalability changes.

The end result of this patch set:
- FIB entries (fib6_info):
    /* size: 208, cachelines: 4, members: 25 */
    /* sum members: 207, holes: 1, sum holes: 1 */
- dst entries (rt6_info):
    /* size: 240, cachelines: 4, members: 12 */

Versus the single rt6_info struct used today for both paths:
    /* size: 320, cachelines: 5, members: 28 */

This amounts to a 35% reduction in memory use for FIB entries and a 25%
reduction for dst entries.

With respect to locking, FIB entries use RCU and a single atomic counter with
fib6_info_hold and fib6_info_release helpers to manage the reference counting.
dst entries use only the traditional dst refcounts with dst_hold and
dst_release.

FIB entries for host routes are referenced by inet6_ifaddr and ifacaddr6. In
both cases, additional holds are taken -- similar to what is done for devices.

This set is the first of many changes to improve the scalability of the IPv6
code. Follow on changes include:
- consolidating duplicate fib6_info references like IPv4 does with duplicate
  fib_info
- moving fib6_info into a slab cache to avoid allocation roundups to a power
  of 2 (the 208 size becomes a 256 actual allocation)
- allowing FIB lookups without generating a dst (e.g., most rt6_lookup users
  just want to verify the egress device); this means moving dst allocation to
  the other side of fib6_rule_lookup, which again aligns with IPv4 behavior
- using separate standalone nexthop objects, which have performance benefits
  beyond fib_info consolidation

At this point I am not seeing any refcount leaks or underflows, no oops or
bug_ons, or warnings from kasan, so I think it is ready for others to beat up
on it, finding errors in code paths I have missed.

David Ahern (20):
  net: Move fib_convert_metrics to dst core
  vrf: Move fib6_table into net_vrf
  net/ipv6: Pass net to fib6_update_sernum
  net/ipv6: Pass net namespace to route functions
  net/ipv6: Move support functions up in route.c
  net/ipv6: Save route type in rt6_info flags
  net/ipv6: Move nexthop data to fib6_nh
  net/ipv6: Defer initialization of dst to data path
  net/ipv6: move metrics from dst to rt6_info
  net/ipv6: move expires into rt6_info
  net/ipv6: Add fib6_null_entry
  net/ipv6: Add rt6_info create function for ip6_pol_route_lookup
  net/ipv6: Move dst flags to booleans in fib entries
  net/ipv6: Create a neigh_lookup for FIB entries
  net/ipv6: Add gfp_flags to route add functions
  net/ipv6: Cleanup exception route handling
  net/ipv6: introduce fib6_info struct and helpers
  net/ipv6: separate handling of FIB entries from dst based routes
  net/ipv6: Flip FIB entries to fib6_info
  net/ipv6: Remove unused code and variables for rt6_info

 .../net/ethernet/mellanox/mlxsw/spectrum_router.c |   96 +-
 drivers/net/vrf.c                                 |   25 +-
 include/net/dst.h                                 |    3 +
 include/net/if_inet6.h                            |    4 +-
 include/net/ip6_fib.h                             |  146 ++-
 include/net/ip6_route.h                           |   46 +-
 include/net/netns/ipv6.h                          |    3 +-
 net/core/dst.c                                    |   49 +
 net/ipv4/fib_semantics.c                          |   43 +-
 net/ipv6/addrconf.c                               |  131 +-
 net/ipv6/anycast.c                                |   21 +-
 net/ipv6/ip6_fib.c                                |  344 +++--
 net/ipv6/ip6_output.c                             |    3 +-
 net/ipv6/ndisc.c                                  |   35 +-
 net/ipv6/route.c                                  | 1361 ++--
 net/ipv6/xfrm6_policy.c                           |    2 -
 16 files changed, 1183 insertions(+), 1129 deletions(-)

-- 
2.11.0
[PATCH RFC net-next 01/20] net: Move fib_convert_metrics to dst core
Move logic of fib_convert_metrics into dst_metrics_convert. This allows the code that converts netlink attributes into metrics struct to be re-used in a later patch by IPv6. This is mostly a code move only; it involvesthe following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from cfg - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern--- include/net/dst.h| 3 +++ net/core/dst.c | 48 net/ipv4/fib_semantics.c | 43 ++- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index c63d2c37f6e9..36fa8c894c65 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -193,6 +193,9 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) p[metric-1] = val; } +int dst_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1 << 31) diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..5f70bc832bec 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -248,6 +249,53 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +int dst_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!fc_mx) + return 0; + + nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(net, tmp, _ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && 
val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + metrics[type - 1] = val; + } + + if (ecn_ca) + metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} +EXPORT_SYMBOL_GPL(dst_metrics_convert); + static struct dst_ops md_dst_ops = { .family = AF_UNSPEC, }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index cd46d7666598..db48d719b2b4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1019,47 +1019,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - - if (!cfg->fc_mx) - return 0; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (type > RTAX_MAX) - return -EINVAL; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, _ca); - if (val == TCP_CA_UNSPEC) - return -EINVAL; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - return -EINVAL; - fi->fib_metrics->metrics[type - 1] = val; - } - - if (ecn_ca) - fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; -
[PATCH RFC net-next 02/20] vrf: Move fib6_table into net_vrf
A later patch removes rt6i_table from rt6_info. Save the ipv6 table for a VRF in net_vrf. fib tables can not be deleted so no reference counting or locking is required. Signed-off-by: David Ahern--- drivers/net/vrf.c | 25 ++--- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9ce0182223a0..7d5407eede6c 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -48,6 +48,9 @@ static unsigned int vrf_net_id; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; +#if IS_ENABLED(CONFIG_IPV6) + struct fib6_table *fib6_table; +#endif u32 tb_id; }; @@ -496,7 +499,6 @@ static int vrf_rt6_create(struct net_device *dev) int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); - struct fib6_table *rt6i_table; struct rt6_info *rt6; int rc = -ENOMEM; @@ -504,8 +506,8 @@ static int vrf_rt6_create(struct net_device *dev) if (!ipv6_mod_enabled()) return 0; - rt6i_table = fib6_new_table(net, vrf->tb_id); - if (!rt6i_table) + vrf->fib6_table = fib6_new_table(net, vrf->tb_id); + if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ @@ -513,7 +515,6 @@ static int vrf_rt6_create(struct net_device *dev) if (!rt6) goto out; - rt6->rt6i_table = rt6i_table; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); @@ -944,22 +945,8 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, int flags) { struct net_vrf *vrf = netdev_priv(dev); - struct fib6_table *table = NULL; - struct rt6_info *rt6; - - rcu_read_lock(); - - /* fib6_table does not have a refcnt and can not be freed */ - rt6 = rcu_dereference(vrf->rt6); - if (likely(rt6)) - table = rt6->rt6i_table; - - rcu_read_unlock(); - - if (!table) - return NULL; - return ip6_pol_route(net, table, ifindex, fl6, flags); + return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device 
*vrf_dev, -- 2.11.0
Re: [PATCH] netfilter: use skb_to_full_sk in ip6_route_me_harder
On Sun, Feb 25, 2018 at 11:43:39AM -0800, Eric Dumazet wrote:
> From: Eric Dumazet
>
> For some reason, Florian forgot to apply to ip6_route_me_harder
> the fix that went in commit 29e09229d9f2 ("netfilter: use
> skb_to_full_sk in ip_route_me_harder")

Applied, thanks Eric.
Re: [PATCH] netfilter: use skb_to_full_sk in ip6_route_me_harder
On Sun, 2018-02-25 at 11:43 -0800, Eric Dumazet wrote: > From: Eric Dumazet> > For some reason, Florian forgot to apply to ip6_route_me_harder > the fix that went in commit 29e09229d9f2 ("netfilter: use > skb_to_full_sk in ip_route_me_harder") > > Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead > of listener") > Signed-off-by: Eric Dumazet > Reported-by: syzbot > --- > net/ipv6/netfilter.c |9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c > index > d95ceca7ff8f648ff301d91a2e3eb60fc2050f1c..531d6957af36c4af48176f9360e9d > 95f78a45d55 100644 > --- a/net/ipv6/netfilter.c > +++ b/net/ipv6/netfilter.c > @@ -21,18 +21,19 @@ > int ip6_route_me_harder(struct net *net, struct sk_buff *skb) > { > const struct ipv6hdr *iph = ipv6_hdr(skb); > + struct sock *sk = sk_to_full_sk(skb->sk); > unsigned int hh_len; > struct dst_entry *dst; > struct flowi6 fl6 = { > - .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, > + .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, > .flowi6_mark = skb->mark, > - .flowi6_uid = sock_net_uid(net, skb->sk), > + .flowi6_uid = sock_net_uid(net, sk), > .daddr = iph->daddr, > .saddr = iph->saddr, > }; > int err; > > - dst = ip6_route_output(net, skb->sk, ); > + dst = ip6_route_output(net, sk, ); > err = dst->error; > if (err) { > IP6_INC_STATS(net, ip6_dst_idev(dst), > IPSTATS_MIB_OUTNOROUTES); > @@ -50,7 +51,7 @@ int ip6_route_me_harder(struct net *net, struct > sk_buff *skb) > if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && > xfrm_decode_session(skb, flowi6_to_flowi(), AF_INET6) > == 0) { > skb_dst_set(skb, NULL); > - dst = xfrm_lookup(net, dst, flowi6_to_flowi(), > skb->sk, 0); > + dst = xfrm_lookup(net, dst, flowi6_to_flowi(), sk, > 0); > if (IS_ERR(dst)) > return PTR_ERR(dst); > skb_dst_set(skb, dst); No idea what happened, but it looks like this garbage should not affect patchwork. Tell me if a resend is needed, thanks.
Re: [PATCH] netfilter: use skb_to_full_sk in ip6_route_me_harder
Eric Dumazet wrote:
> From: Eric Dumazet
>
> For some reason, Florian forgot to apply to ip6_route_me_harder
> the fix that went in commit 29e09229d9f2 ("netfilter: use
> skb_to_full_sk in ip_route_me_harder")

Indeed, sorry about that, thanks for taking care of this.
[PATCH] netfilter: use skb_to_full_sk in ip6_route_me_harder
From: Eric DumazetFor some reason, Florian forgot to apply to ip6_route_me_harder the fix that went in commit 29e09229d9f2 ("netfilter: use skb_to_full_sk in ip_route_me_harder") Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Reported-by: syzbot --- net/ipv6/netfilter.c |9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d95ceca7ff8f648ff301d91a2e3eb60fc2050f1c..531d6957af36c4af48176f9360e9d 95f78a45d55 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -21,18 +21,19 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); + struct sock *sk = sk_to_full_sk(skb->sk); unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, .flowi6_mark = skb->mark, - .flowi6_uid = sock_net_uid(net, skb->sk), + .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, }; int err; - dst = ip6_route_output(net, skb->sk, ); + dst = ip6_route_output(net, sk, ); err = dst->error; if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -50,7 +51,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, flowi6_to_flowi(), AF_INET6) == 0) { skb_dst_set(skb, NULL); - dst = xfrm_lookup(net, dst, flowi6_to_flowi(), skb->sk, 0); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst);
Re: [PATCH net-next 0/5] Modernize bitbanged GPIO MDIO
On Sun, Feb 25, 2018 at 01:51:27PM +0100, Linus Walleij wrote:
> This kills off the platform data support from the bitbanged
> GPIO-based MDIO driver and moves it over to using GPIO
> descriptors exclusively.

Hi Linus

I like where this ends up. I wonder about the path it takes to get there.
There seems to be quite a lot of code which gets moved around and then in the
end deleted. Maybe changing the order of the patches would help. Converting to
devm_gpiod_get_index() first would remove all the active_low flags. Then
remove all the unused reset callback, irq, etc?

	Thanks
		Andrew
Re: [PATCH nf] netfilter: ebtables: CONFIG_COMPAT: don't trust userland offsets
On Mon, Feb 19, 2018 at 01:24:15AM +0100, Florian Westphal wrote:
> We need to make sure the offsets are not out of range of the
> total size.
> Also check that they are in ascending order.
>
> The WARN_ON triggered by syzkaller (it sets panic_on_warn) is
> changed to also bail out, no point in continuing parsing.
>
> Briefly tested with simple ruleset of
> -A INPUT --limit 1/s' --log
> plus jump to custom chains using 32bit ebtables binary.

Also applied, thanks.
Re: [PATCH nf] netfilter: bridge: ebt_among: add missing match size checks
On Mon, Feb 19, 2018 at 03:01:45AM +0100, Florian Westphal wrote:
> ebt_among is special, it has a dynamic match size and is exempt
> from the central size checks.
>
> Therefore it must check that the size of the match structure
> provided from userspace is sane by making sure em->match_size
> is at least the minimum size of the expected structure.
>
> The module has such a check, but it's only done after accessing
> a structure that might be out of bounds.
>
> tested with: ebtables -A INPUT ... \
> --among-dst fe:fe:fe:fe:fe:fe
> --among-dst fe:fe:fe:fe:fe:fe --among-src
> fe:fe:fe:fe:ff:f,fe:fe:fe:fe:fe:fb,fe:fe:fe:fe:fc:fd,fe:fe:fe:fe:fe:fd,fe:fe:fe:fe:fe:fe
> --among-src
> fe:fe:fe:fe:ff:f,fe:fe:fe:fe:fe:fa,fe:fe:fe:fe:fe:fd,fe:fe:fe:fe:fe:fe,fe:fe:fe:fe:fe:fe

Applied, thanks Florian.
Re: [PATCH net-next 1/5] net: fib_rules: support for match on ip_proto, sport and dport
On Sun, Feb 25, 2018 at 7:04 AM, Nikolay Aleksandrovwrote: > On 25/02/18 07:44, Roopa Prabhu wrote: >> From: Roopa Prabhu >> >> uapi for ip_proto, sport and dport range match >> in fib rules. >> >> Signed-off-by: Roopa Prabhu >> --- >> include/net/fib_rules.h| 31 +- >> include/uapi/linux/fib_rules.h | 8 >> net/core/fib_rules.c | 94 >> +- >> 3 files changed, 130 insertions(+), 3 deletions(-) >> > > You should probably update validate_rulemsg() as well, these aren't added in > the per-proto > policies and nothing validates if the attribute data is actually there. Maybe > I'm missing > something obvious, but it looks like many other FRA_ attributes don't have > such checks. yeah, I added the sport checks there initially and later removed it since I did not see any of the other FRA_* attributes there. and then ended up adding it in the respective rule add and del functions where other attributes were validated for consistency. I can submit a follow on patch to move all FRA_* attribute validations to validate_rulemsg(). 
I can sure start with the new attribute validation in this series in validate_rulemsg in v2 > >> diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h >> index b3d2162..6d99202 100644 >> --- a/include/net/fib_rules.h >> +++ b/include/net/fib_rules.h >> @@ -11,6 +11,11 @@ >> #include >> #include >> >> +struct fib_port_range { >> + __u16 start; >> + __u16 end; >> +}; >> + >> struct fib_kuid_range { >> kuid_t start; >> kuid_t end; >> @@ -27,7 +32,7 @@ struct fib_rule { >> u8 action; >> u8 l3mdev; >> u8 proto; >> - /* 1 byte hole, try to use */ >> + u8 ip_proto; >> u32 target; >> __be64 tun_id; >> struct fib_rule __rcu *ctarget; >> @@ -40,6 +45,8 @@ struct fib_rule { >> chariifname[IFNAMSIZ]; >> charoifname[IFNAMSIZ]; >> struct fib_kuid_range uid_range; >> + struct fib_port_range sport_range; >> + struct fib_port_range dport_range; >> struct rcu_head rcu; >> }; >> >> @@ -144,6 +151,28 @@ static inline u32 frh_get_table(struct fib_rule_hdr >> *frh, struct nlattr **nla) >> return frh->table; >> } >> >> +static inline bool fib_rule_port_inrange(struct fib_port_range *a, >> + __be16 port) >> +{ >> + if (!a->start) >> + return true; > > Can start be == 0 ? > IIUC this check is unnecessary because when you're adding the new rule, > you do a check for start > 0 so it shouldn't be possible to be 0. 
> >> + return ntohs(port) >= a->start && >> + ntohs(port) <= a->end; >> +} >> + >> +static inline bool fib_rule_port_range_valid(const struct fib_port_range *a) >> +{ >> + return a->start > 0 && a->end < 0x && >> + a->start <= a->end; > > nit: alignment (also can be on a single line) > >> +} >> + >> +static inline bool fib_rule_port_range_compare(struct fib_port_range *a, >> +struct fib_port_range *b) >> +{ >> + return a->start == b->start && >> + a->end == b->end; > > nit: alignment (also can be on a single line) > >> +} >> + >> struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, >>struct net *); >> void fib_rules_unregister(struct fib_rules_ops *); >> diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h >> index 77d90ae..232df14 100644 >> --- a/include/uapi/linux/fib_rules.h >> +++ b/include/uapi/linux/fib_rules.h >> @@ -35,6 +35,11 @@ struct fib_rule_uid_range { >> __u32 end; >> }; >> >> +struct fib_rule_port_range { >> + __u16 start; >> + __u16 end; >> +}; >> + >> enum { >> FRA_UNSPEC, >> FRA_DST,/* destination address */ >> @@ -59,6 +64,9 @@ enum { >> FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ >> FRA_UID_RANGE, /* UID range */ >> FRA_PROTOCOL, /* Originator of the rule */ >> + FRA_IP_PROTO, /* ip proto */ >> + FRA_SPORT_RANGE, /* sport */ >> + FRA_DPORT_RANGE, /* dport */ >> __FRA_MAX >> }; >> >> diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c >> index a6aea80..5008235 100644 >> --- a/net/core/fib_rules.c >> +++ b/net/core/fib_rules.c >> @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) >> if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || >> !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) >> return false; >> + if (fib_rule_port_range_valid(>sport_range)) >> + return false; >> + if
Re: [PATCH net-next 5/5] ipv6: route: dissect flow in input path if fib rules need it
On Sun, Feb 25, 2018 at 7:10 AM, Nikolay Aleksandrovwrote: > On 25/02/18 07:44, Roopa Prabhu wrote: >> From: Roopa Prabhu >> >> Dissect flow in fwd path if fib rules require it. Controlled by >> a flag to avoid penatly for the common case. Flag is set when fib >> rules with sport, dport and proto match that require flow dissect >> are installed. Also passes the dissected hash keys to the multipath >> hash function when applicable to avoid dissecting the flow again. >> icmp packets will continue to use inner header for hash >> calculations. >> >> Signed-off-by: Roopa Prabhu >> --- >> include/net/ip6_route.h | 3 ++- >> include/net/netns/ipv6.h | 1 + >> net/ipv6/fib6_rules.c| 5 + >> net/ipv6/icmp.c | 2 +- >> net/ipv6/route.c | 45 - >> 5 files changed, 45 insertions(+), 11 deletions(-) >> >> diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h >> index 27d23a6..218f89c 100644 >> --- a/include/net/ip6_route.h >> +++ b/include/net/ip6_route.h >> @@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, >> struct rt6_info *rt, >> >> struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, >> const struct in6_addr *saddr, int oif, int flags); >> -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb); >> +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, >> +struct flow_keys *hkeys); >> >> struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 >> *fl6); >> >> diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h >> index 987cc45..7aca00e 100644 >> --- a/include/net/netns/ipv6.h >> +++ b/include/net/netns/ipv6.h >> @@ -72,6 +72,7 @@ struct netns_ipv6 { >> unsigned longip6_rt_last_gc; >> #ifdef CONFIG_IPV6_MULTIPLE_TABLES >> bool fib6_has_custom_rules; >> + bool fib6_rules_require_fldissect; >> struct rt6_info *ip6_prohibit_entry; >> struct rt6_info *ip6_blk_hole_entry; >> struct fib6_table *fib6_local_tbl; >> diff --git a/net/ipv6/fib6_rules.c 
b/net/ipv6/fib6_rules.c >> index 678d664..e3a7861 100644 >> --- a/net/ipv6/fib6_rules.c >> +++ b/net/ipv6/fib6_rules.c >> @@ -267,6 +267,11 @@ static int fib6_rule_configure(struct fib_rule *rule, >> struct sk_buff *skb, >> rule6->dst.plen = frh->dst_len; >> rule6->tclass = frh->tos; >> >> + if (rule->ip_proto || >> + fib_rule_port_range_valid(>sport_range) || >> + fib_rule_port_range_valid(>dport_range)) >> + net->ipv6.fib6_rules_require_fldissect = true; >> + >> net->ipv6.fib6_has_custom_rules = true; >> err = 0; >> errout: >> diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c >> index 4fa4f1b..b0778d3 100644 >> --- a/net/ipv6/icmp.c >> +++ b/net/ipv6/icmp.c >> @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 >> code, __u32 info, >> fl6.fl6_icmp_type = type; >> fl6.fl6_icmp_code = code; >> fl6.flowi6_uid = sock_net_uid(net, NULL); >> - fl6.mp_hash = rt6_multipath_hash(, skb); >> + fl6.mp_hash = rt6_multipath_hash(, skb, NULL); >> security_skb_classify_flow(skb, flowi6_to_flowi()); >> >> sk = icmpv6_xmit_lock(net); >> diff --git a/net/ipv6/route.c b/net/ipv6/route.c >> index aa709b6..778212b 100644 >> --- a/net/ipv6/route.c >> +++ b/net/ipv6/route.c >> @@ -460,7 +460,7 @@ static struct rt6_info *rt6_multipath_select(struct >> rt6_info *match, >>* case it will always be non-zero. Otherwise now is the time to do it. 
>>*/ >> if (!fl6->mp_hash) >> - fl6->mp_hash = rt6_multipath_hash(fl6, NULL); >> + fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); >> >> if (fl6->mp_hash <= atomic_read(>rt6i_nh_upper_bound)) >> return match; >> @@ -1786,10 +1786,12 @@ struct dst_entry *ip6_route_input_lookup(struct net >> *net, >> EXPORT_SYMBOL_GPL(ip6_route_input_lookup); >> >> static void ip6_multipath_l3_keys(const struct sk_buff *skb, >> - struct flow_keys *keys) >> + struct flow_keys *keys, >> + struct flow_keys *flkeys) >> { >> const struct ipv6hdr *outer_iph = ipv6_hdr(skb); >> const struct ipv6hdr *key_iph = outer_iph; >> + struct flow_keys *_flkeys = flkeys; >> const struct ipv6hdr *inner_iph; >> const struct icmp6hdr *icmph; >> struct ipv6hdr _inner_iph; >> @@ -1811,22 +1813,31 @@ static void ip6_multipath_l3_keys(const struct >> sk_buff *skb, >> goto out; >> >> key_iph = inner_iph; >> + _flkeys = NULL; >> out: >> memset(keys, 0, sizeof(*keys)); >> keys->control.addr_type =
Re: [PATCH V2 net-next 2/3] rds: deliver zerocopy completion notification with data
On Sun, Feb 25, 2018 at 11:20 AM, Sowmini Varadhan wrote: > On (02/25/18 10:56), Willem de Bruijn wrote: >> > @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock >> > *rs, >> > spin_unlock_irqrestore(&q->lock, flags); >> > mm_unaccount_pinned_pages(&znotif->z_mmp); >> > consume_skb(rds_skb_from_znotifier(znotif)); >> > - sk->sk_error_report(sk); >> > + /* caller should wake up POLLIN */ >> sk->sk_data_ready(sk); > yes, this was my first thought, but everything else in rds > is calling rds_wake_sk_sleep (this is even done in > rds_recv_incoming(), which actually queues up the data), > so I chose to align with that model (and call this in the caller > of rds_rm_zerocopy_callback()) Ah, understood. Perhaps say "wakes" instead of "should wake". I mistakenly read this as a todo. >> Without the error queue, the struct no longer needs to be an skb, >> per se. Converting to a different struct with list_head is definitely >> a longer patch. But kmalloc will be cheaper than alloc_skb. >> Perhaps something to try (as separate follow-on work). > > right, I was thinking along these exact lines as well, > and was already planning a follow-up. > >> > + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || !skb_peek(q)) >> > + return 0; >> >> Racy read? > > Can you elaborate? I only put the skb_peek to quickly > bail for sockets that are not using zerocopy at all - > if you race against something that's queuing data, and > miss it on the peek, the next read/recv should find it. > Am I missing some race? It's a lockless access. But intentionally so, then. You're right, as long as the subsequent skb_dequeue handles the case where the queue is empty, it seems okay to optimistically probe lockless first. >> >> > + >> > + if (!msg->msg_control || >> >> I'd move this first, so that the cookie queue need not even be probed >> in the common case. > > you mean before the check for SOCK_ZEROCOPY? 
Yes >> > + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) >> > + return 0; >> >> if caller does not satisfy the contract on controllen size, can be >> more explicit and return an error. > > if SOCK_ZEROCOPY has been set, but the recv did not specify a cmsghdr, > you mean? I mean if SOCK_ZEROCOPY has been set and the caller calls recvmsg with a control buffer, but one that is too small to handle zerocopy cookie notifications. >> > + ncookies = rds_recvmsg_zcookie(rs, msg); > > Will take care of the remaining comments in V3.
Re: Potential issue with f5e64032a799 "net: phy: fix resume handling"
On Sun, Feb 25, 2018 at 02:00:43PM +0100, Heiner Kallweit wrote: > On 03.02.2018 at 21:17, Andrew Lunn wrote: > > On Sat, Feb 03, 2018 at 05:41:54PM +0100, Heiner Kallweit wrote: > >> This commit forces callers of phy_resume() and phy_suspend() to hold > >> mutex phydev->lock. This was done for calls to phy_resume() and > >> phy_suspend() in phylib, however there are more callers in network > >> drivers. I'd assume that these other calls issue a warning now > >> because of the lock not being held. > >> So is there something I miss or would this have to be fixed? > > > > Hi Heiner > > > > This is a good point. > > > > Yes, it looks like some fixes are needed. But what exactly? > > > The issue with phy_suspend/phy_resume and the changed locking > behavior is still open AFAICS. There was a proposed fix > https://www.mail-archive.com/netdev@vger.kernel.org/msg215455.html > and then the discussion stopped. > I think we need the fix before 4.16 leaves the rc phase. Hi Heiner, I have a patch I will post later today. Andrew
Re: [PATCH 0/2] mark some slabs as visible not mergeable
On Sat, 24 Feb 2018 11:04:52 -0800 Stephen Hemminger wrote: > This fixes an old bug in iproute2's ss command because it was > reading slabinfo to get statistics. There isn't a better API > to do this, and one can argue that /proc is a UAPI that must > not change. > > Therefore this patch set adds a flag to slab to give another > reason to prevent merging, and then uses it in network code. > > The patches are against davem's linux-net tree and should also > go to stable as well. > > Stephen Hemminger (2): > slab: add flag to block merging of UAPI elements > net: mark slab's used by ss as UAPI > > include/linux/slab.h | 6 ++ > mm/slab_common.c | 2 +- > net/ipv4/tcp.c | 3 ++- > net/ipv4/tcp_ipv4.c | 2 +- > net/ipv6/tcp_ipv6.c | 2 +- > net/socket.c | 6 +++--- > 6 files changed, 14 insertions(+), 7 deletions(-) > The kbuild reports need more root cause investigation before applying.
Re: [PATCH V2 net-next 2/3] rds: deliver zerocopy completion notification with data
On (02/25/18 10:56), Willem de Bruijn wrote: > > @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock > > *rs, > > spin_unlock_irqrestore(>lock, flags); > > mm_unaccount_pinned_pages(>z_mmp); > > consume_skb(rds_skb_from_znotifier(znotif)); > > - sk->sk_error_report(sk); > > + /* caller should wake up POLLIN */ > > sk->sk_data_ready(sk); yes, this was my first thought, but everything else in rds is calling rds_wake_sk_sleep (this is even done in rds_recv_incoming(), which actually queues up the data), so I chose to align with that model (and call this in the caller of rds_rm_zerocopy_callback() > Without the error queue, the struct no longer needs to be an skb, > per se. Converting to a different struct with list_head is definitely > a longer patch. But kmalloc will be cheaper than alloc_skb. > Perhaps something to try (as separate follow-on work). right, I was thinking along these exact lines as well, and was already planning a follow-up. > > + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || !skb_peek(q)) > > + return 0; > > Racy read? Can you elaborate? I only put the skb_peek to quickly bail for sockets that are not using zerocopy at all- if you race against something that's queuing data, and miss it on the peek, the next read/recv should find it. Am I missing some race? > > > + > > + if (!msg->msg_control || > > I'd move this first, so that the cookie queue need not even be probed > in the common case. you mean before the check for SOCK_ZEROCOPY? > > + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) > > + return 0; > > if caller does not satisfy the contract on controllen size, can be > more explicit and return an error. if SOCK_ZEROCOPY has been set, but the recv did not specify a cmsghdr, you mean? > > + ncookies = rds_recvmsg_zcookie(rs, msg); Will take care of the remaining comments in V3.
Re: [PATCH V2 net-next 3/3] selftests/net: reap zerocopy completions passed up as ancillary data.
On Fri, Feb 23, 2018 at 5:08 PM, Sowmini Varadhanwrote: > PF_RDS sockets pass up cookies for zerocopy completion as ancillary > data. Update msg_zerocopy to reap this information. > > Signed-off-by: Sowmini Varadhan > --- > v2: receive zerocopy completion notification as POLLIN > > tools/testing/selftests/net/msg_zerocopy.c | 60 > > 1 files changed, 52 insertions(+), 8 deletions(-) > > diff --git a/tools/testing/selftests/net/msg_zerocopy.c > b/tools/testing/selftests/net/msg_zerocopy.c > index eff9cf2..8c466e8 100644 > --- a/tools/testing/selftests/net/msg_zerocopy.c > +++ b/tools/testing/selftests/net/msg_zerocopy.c > @@ -344,7 +344,48 @@ static int do_setup_tx(int domain, int type, int > protocol) > return fd; > } > > -static bool do_recv_completion(int fd) > +static int do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck) > +{ > + int ncookies, i; Since this is example code, let's be pedantic about type: uint32_t. Or don't use a local variable. > + > + ncookies = ck->num; > + if (ncookies > RDS_MAX_ZCOOKIES) > + error(1, 0, "Returned %d cookies, max expected %d\n", > + ncookies, RDS_MAX_ZCOOKIES); > + for (i = 0; i < ncookies; i++) > + if (cfg_verbose >= 2) > + fprintf(stderr, "%d\n", ck->cookies[i]); > + return ncookies; > +} > + > +static int do_recvmsg_completion(int fd) > +{ > + struct msghdr msg; > + char cmsgbuf[256]; more precise: CMSG_SPACE(sizeof(*ck)); > + struct cmsghdr *cmsg; > + bool ret = false; > + struct rds_zcopy_cookies *ck; then this must move above. Reverse christmas tree is preferred, anyway. 
> + > + memset(, 0, sizeof(msg)); > + msg.msg_control = cmsgbuf; > + msg.msg_controllen = sizeof(cmsgbuf); > + > + if (recvmsg(fd, , MSG_DONTWAIT)) > + return ret; check msg_flags & MSG_CTRUNC > + for (cmsg = CMSG_FIRSTHDR(); cmsg; cmsg = CMSG_NXTHDR(, > cmsg)) { > + if (cmsg->cmsg_level == SOL_RDS && > + cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) { > + > + ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg); > + completions += do_process_zerocopy_cookies(ck); > + ret = true; > + break; > + } maybe warn on unexpected other type or level > + } > + return ret; > +} > + > +static bool do_recv_completion(int fd, int domain) > { > struct sock_extended_err *serr; > struct msghdr msg = {}; > @@ -353,6 +394,9 @@ static bool do_recv_completion(int fd) > int ret, zerocopy; > char control[100]; > > + if (domain == PF_RDS) > + return do_recvmsg_completion(fd); > + > msg.msg_control = control; > msg.msg_controllen = sizeof(control); > > @@ -409,20 +453,20 @@ static bool do_recv_completion(int fd) > } > > /* Read all outstanding messages on the errqueue */ > -static void do_recv_completions(int fd) > +static void do_recv_completions(int fd, int domain) > { > - while (do_recv_completion(fd)) {} > + while (do_recv_completion(fd, domain)) {} > } > > /* Wait for all remaining completions on the errqueue */ > -static void do_recv_remaining_completions(int fd) > +static void do_recv_remaining_completions(int fd, int domain) > { > int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; > > while (completions < expected_completions && >gettimeofday_ms() < tstop) { > - if (do_poll(fd, POLLERR)) > - do_recv_completions(fd); > + if (do_poll(fd, domain == PF_RDS ? 
POLLIN : POLLERR)) > + do_recv_completions(fd, domain); > } > > if (completions < expected_completions) > @@ -503,13 +547,13 @@ static void do_tx(int domain, int type, int protocol) > > while (!do_poll(fd, POLLOUT)) { > if (cfg_zerocopy) > - do_recv_completions(fd); > + do_recv_completions(fd, domain); > } > > } while (gettimeofday_ms() < tstop); > > if (cfg_zerocopy) > - do_recv_remaining_completions(fd); > + do_recv_remaining_completions(fd, domain); > > if (close(fd)) > error(1, errno, "close"); > -- > 1.7.1 >
Re: [PATCH V2 net-next 2/3] rds: deliver zerocopy completion notification with data
On Fri, Feb 23, 2018 at 5:08 PM, Sowmini Varadhanwrote: > This commit is an optimization of the commit 01883eda72bd > ("rds: support for zcopy completion notification") for PF_RDS sockets. > > RDS applications are predominantly request-response transactions, so > it is more efficient to reduce the number of system calls and have > zerocopy completion notification delivered as ancillary data on the > POLLIN channel. > > Cookies are passed up as ancillary data (at level SOL_RDS) in a > struct rds_zcopy_cookies when the returned value of recvmsg() is > greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed > with each message. > > This commit removes support for zerocopy completion notification on > MSG_ERRQUEUE for PF_RDS sockets. > > Signed-off-by: Sowmini Varadhan > --- > diff --git a/net/rds/message.c b/net/rds/message.c > index 6518345..2e8bdaf 100644 > --- a/net/rds/message.c > +++ b/net/rds/message.c > @@ -58,32 +58,26 @@ void rds_message_addref(struct rds_message *rm) > > static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) > { > - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); > - int ncookies; > - u32 *ptr; > + struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb; please add a bounds check (not necessarily right here) BUILD_BUG_ON(sizeof(*ck) > sizeof(skb->cb)); > static void rds_rm_zerocopy_callback(struct rds_sock *rs, > struct rds_znotifier *znotif) > { > - struct sock *sk = rds_rs_to_sk(rs); > struct sk_buff *skb, *tail; > - struct sock_exterr_skb *serr; > unsigned long flags; > struct sk_buff_head *q; > u32 cookie = znotif->z_cookie; > + struct rds_zcopy_cookies *ck; > > - q = >sk_error_queue; > + q = >rs_zcookie_queue; > spin_lock_irqsave(>lock, flags); > tail = skb_peek_tail(q); > > @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, > spin_unlock_irqrestore(>lock, flags); > mm_unaccount_pinned_pages(>z_mmp); > consume_skb(rds_skb_from_znotifier(znotif)); > - 
sk->sk_error_report(sk); > + /* caller should wake up POLLIN */ sk->sk_data_ready(sk); > @@ -362,8 +354,7 @@ int rds_message_copy_from_user(struct rds_message *rm, > struct iov_iter *from, > int total_copied = 0; > struct sk_buff *skb; > > - skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), > - GFP_KERNEL); > + skb = alloc_skb(0, GFP_KERNEL); Without the error queue, the struct no longer needs to be an skb, per se. Converting to a different struct with list_head is definitely a longer patch. But kmalloc will be cheaper than alloc_skb. Perhaps something to try (as separate follow-on work). > +static int rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) > +{ > + struct sk_buff *skb; > + struct sk_buff_head *q = >rs_zcookie_queue; > + struct rds_zcopy_cookies *done; > + int ret; > + > + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || !skb_peek(q)) > + return 0; Racy read? > + > + if (!msg->msg_control || I'd move this first, so that the cookie queue need not even be probed in the common case. > + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) > + return 0; if caller does not satisfy the contract on controllen size, can be more explicit and return an error. > + > + skb = skb_dequeue(q); > + if (!skb) > + return 0; > + done = (struct rds_zcopy_cookies *)skb->cb; > + ret = done->num; done->num is guaranteed to be >= 1, so ret is not strictly needed. 
> + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), > +done)) { > + skb_queue_head(q, skb); > + ret = 0; > + } else { > + consume_skb(skb); > + } > + return ret; > +} > + > int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, > int msg_flags) > { > @@ -586,6 +615,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, > size_t size, > int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; > DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); > struct rds_incoming *inc = NULL; > + int ncookies; > > /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ > timeo = sock_rcvtimeo(sk, nonblock); > @@ -611,7 +641,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, > size_t size, > > if (!rds_next_incoming(rs, )) { > if (nonblock) { > - ret = -EAGAIN; > + ncookies = rds_recvmsg_zcookie(rs, msg); > +
Re: [PATCH net-next 1/5] net: fib_rules: support for match on ip_proto, sport and dport
On 25/02/18 17:04, Nikolay Aleksandrov wrote: > On 25/02/18 07:44, Roopa Prabhu wrote: >> From: Roopa Prabhu>> >> uapi for ip_proto, sport and dport range match >> in fib rules. >> >> Signed-off-by: Roopa Prabhu >> --- [snip] >> struct rcu_head rcu; >> }; >> >> @@ -144,6 +151,28 @@ static inline u32 frh_get_table(struct fib_rule_hdr >> *frh, struct nlattr **nla) >> return frh->table; >> } >> >> +static inline bool fib_rule_port_inrange(struct fib_port_range *a, >> + __be16 port) >> +{ >> +if (!a->start) >> +return true; > > Can start be == 0 ? > IIUC this check is unnecessary because when you're adding the new rule, > you do a check for start > 0 so it shouldn't be possible to be 0. Nevermind this comment, I spoke too soon and saw the match later. :-) > >> +return ntohs(port) >= a->start && >> +ntohs(port) <= a->end; >> +} >> + >> +static inline bool fib_rule_port_range_valid(const struct fib_port_range *a) >> +{ >> +return a->start > 0 && a->end < 0x && >> +a->start <= a->end; > > nit: alignment (also can be on a single line) > >> +} >> + >> +static inline bool fib_rule_port_range_compare(struct fib_port_range *a, >> + struct fib_port_range *b) >> +{ >> +return a->start == b->start && >> +a->end == b->end; > > nit: alignment (also can be on a single line) > >> +} >> + >> struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, >> struct net *); >> void fib_rules_unregister(struct fib_rules_ops *); >> diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h >> index 77d90ae..232df14 100644 >> --- a/include/uapi/linux/fib_rules.h >> +++ b/include/uapi/linux/fib_rules.h >> @@ -35,6 +35,11 @@ struct fib_rule_uid_range { >> __u32 end; >> }; >> >> +struct fib_rule_port_range { >> +__u16 start; >> +__u16 end; >> +}; >> + >> enum { >> FRA_UNSPEC, >> FRA_DST,/* destination address */ >> @@ -59,6 +64,9 @@ enum { >> FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ >> FRA_UID_RANGE, /* UID range */ >> FRA_PROTOCOL, /* Originator of the 
rule */ >> +FRA_IP_PROTO, /* ip proto */ >> +FRA_SPORT_RANGE, /* sport */ >> +FRA_DPORT_RANGE, /* dport */ >> __FRA_MAX >> }; >> >> diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c >> index a6aea80..5008235 100644 >> --- a/net/core/fib_rules.c >> +++ b/net/core/fib_rules.c >> @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) >> if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || >> !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) >> return false; >> +if (fib_rule_port_range_valid(>sport_range)) >> +return false; >> +if (fib_rule_port_range_valid(>dport_range)) >> +return false; >> return true; >> } >> EXPORT_SYMBOL_GPL(fib_rule_matchall); >> @@ -221,6 +225,12 @@ static int nla_put_uid_range(struct sk_buff *skb, >> struct fib_kuid_range *range) >> return nla_put(skb, FRA_UID_RANGE, sizeof(out), ); >> } >> >> +static int nla_put_port_range(struct sk_buff *skb, int attrtype, >> + struct fib_port_range *range) >> +{ >> +return nla_put(skb, attrtype, sizeof(*range), range); >> +} >> + >> static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, >>struct flowi *fl, int flags, >>struct fib_lookup_arg *arg) >> @@ -425,6 +435,17 @@ static int rule_exists(struct fib_rules_ops *ops, >> struct fib_rule_hdr *frh, >> !uid_eq(r->uid_range.end, rule->uid_range.end)) >> continue; >> >> +if (r->ip_proto != rule->ip_proto) >> +continue; >> + >> +if (!fib_rule_port_range_compare(>sport_range, >> + >sport_range)) >> +continue; >> + >> +if (!fib_rule_port_range_compare(>dport_range, >> + >dport_range)) >> +continue; >> + >> if (!ops->compare(r, frh, tb)) >> continue; >> return 1; >> @@ -432,6 +453,20 @@ static int rule_exists(struct fib_rules_ops *ops, >> struct fib_rule_hdr *frh, >> return 0; >> } >> >> +static int nla_get_port_range(struct nlattr *pattr, >> + struct fib_port_range *port_range) >> +{ >> +const struct fib_port_range *pr = nla_data(pattr); >> + >> +if (!fib_rule_port_range_valid(pr)) >> +return -EINVAL; 
>> + >> +port_range->start = pr->start; >> +port_range->end = pr->end; >> + >> +return 0; >> +} >> + >> int
Re: [virtio-dev] [RFC PATCH V2] virtio_pci: Add SR-IOV support
> On 22 Feb 2018, at 19:52, Mark Rustad wrote: > > Hardware-realized virtio-pci devices can implement SR-IOV, so this > patch enables its use. The device in question is an upcoming Intel > NIC that implements both a virtio-net PF and virtio-net VFs. These > are hardware realizations of what has up to now been a software > interface. > > The device in question has the following 4-part PCI IDs: > > PF: device: 1af4 vendor: 1041 subvendor: 8086 subdevice: 15fe > VF: device: 1af4 vendor: 1041 subvendor: 8086 subdevice: 05fe Small mistake in the commit message. Red Hat (Qumranet) vendor ID is 1af4, virtio-net device ID is 1041. Should be: PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe Best regards, Yan. > > The patch needs no check for device ID, because the callback will > never be made for devices that do not assert the capability or > when run on a platform incapable of SR-IOV. > > One reason for this patch is because the hardware requires the > vendor ID of a VF to be the same as the vendor ID of the PF that > created it. So it seemed logical to simply have a fully-functioning > virtio-net PF create the VFs. This patch makes that possible. 
> > Signed-off-by: Mark Rustad > Reviewed-by: Alexander Duyck > --- > Changes in V2: > - Simplified logic from previous version, removed added driver variable > - Disable SR-IOV on driver removal excapt when VFs are assigned > - Sent as RFC to virtio-dev, linux-pci, netdev, lkml and others > --- > drivers/virtio/virtio_pci_common.c | 47 > 1 file changed, 47 insertions(+) > > diff --git a/drivers/virtio/virtio_pci_common.c > b/drivers/virtio/virtio_pci_common.c > index 48d4d1cf1cb6..78b53ffc4cee 100644 > --- a/drivers/virtio/virtio_pci_common.c > +++ b/drivers/virtio/virtio_pci_common.c > @@ -572,6 +572,47 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, > return rc; > } > > +#ifdef CONFIG_PCI_IOV > +static int virtio_pci_sriov_disable(struct pci_dev *pci_dev) > +{ > + /* If vfs are assigned we cannot shut down SR-IOV without causing > + * issues, so just leave the hardware available. > + */ > + if (pci_vfs_assigned(pci_dev)) { > + dev_warn(_dev->dev, > + "Unloading driver while VFs are assigned - VFs will > not be deallocated\n"); > + return -EPERM; > + } > + pci_disable_sriov(pci_dev); > + return 0; > +} > + > +static int virtio_pci_sriov_enable(struct pci_dev *pci_dev, int num_vfs) > +{ > + int rc = 0; > + > + if (pci_num_vf(pci_dev)) > + return -EINVAL; > + > + rc = pci_enable_sriov(pci_dev, num_vfs); > + if (rc) { > + dev_warn(_dev->dev, "Failed to enable PCI sriov: %d\n", rc); > + return rc; > + } > + dev_info(_dev->dev, "SR-IOV enabled with %d VFs\n", num_vfs); > + return num_vfs; > +} > + > +static int virtio_pci_sriov_configure(struct pci_dev *dev, int num_vfs) > +{ > + if (num_vfs) > + return virtio_pci_sriov_enable(dev, num_vfs); > + if (!pci_num_vf(dev)) > + return -EINVAL; > + return virtio_pci_sriov_disable(dev); > +} > +#endif /* CONFIG_PCI_IOV */ > + > static void virtio_pci_remove(struct pci_dev *pci_dev) > { > struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); > @@ -584,6 +625,9 @@ static void virtio_pci_remove(struct pci_dev 
*pci_dev) > else > virtio_pci_modern_remove(vp_dev); > > +#ifdef CONFIG_PCI_IOV > + virtio_pci_sriov_disable(pci_dev); > +#endif > pci_disable_device(pci_dev); > put_device(dev); > } > @@ -596,6 +640,9 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) > #ifdef CONFIG_PM_SLEEP > .driver.pm = _pci_pm_ops, > #endif > +#ifdef CONFIG_PCI_IOV > + .sriov_configure = virtio_pci_sriov_configure, > +#endif > }; > > module_pci_driver(virtio_pci_driver); > > > - > To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org > For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org >
Re: [PATCH net-next 5/5] ipv6: route: dissect flow in input path if fib rules need it
On 25/02/18 07:44, Roopa Prabhu wrote: > From: Roopa Prabhu> > Dissect flow in fwd path if fib rules require it. Controlled by > a flag to avoid penatly for the common case. Flag is set when fib > rules with sport, dport and proto match that require flow dissect > are installed. Also passes the dissected hash keys to the multipath > hash function when applicable to avoid dissecting the flow again. > icmp packets will continue to use inner header for hash > calculations. > > Signed-off-by: Roopa Prabhu > --- > include/net/ip6_route.h | 3 ++- > include/net/netns/ipv6.h | 1 + > net/ipv6/fib6_rules.c| 5 + > net/ipv6/icmp.c | 2 +- > net/ipv6/route.c | 45 - > 5 files changed, 45 insertions(+), 11 deletions(-) > > diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h > index 27d23a6..218f89c 100644 > --- a/include/net/ip6_route.h > +++ b/include/net/ip6_route.h > @@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, > struct rt6_info *rt, > > struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, > const struct in6_addr *saddr, int oif, int flags); > -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb); > +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, > +struct flow_keys *hkeys); > > struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 > *fl6); > > diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h > index 987cc45..7aca00e 100644 > --- a/include/net/netns/ipv6.h > +++ b/include/net/netns/ipv6.h > @@ -72,6 +72,7 @@ struct netns_ipv6 { > unsigned longip6_rt_last_gc; > #ifdef CONFIG_IPV6_MULTIPLE_TABLES > bool fib6_has_custom_rules; > + bool fib6_rules_require_fldissect; > struct rt6_info *ip6_prohibit_entry; > struct rt6_info *ip6_blk_hole_entry; > struct fib6_table *fib6_local_tbl; > diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c > index 678d664..e3a7861 100644 > --- a/net/ipv6/fib6_rules.c > +++ b/net/ipv6/fib6_rules.c > @@ 
-267,6 +267,11 @@ static int fib6_rule_configure(struct fib_rule *rule, > struct sk_buff *skb, > rule6->dst.plen = frh->dst_len; > rule6->tclass = frh->tos; > > + if (rule->ip_proto || > + fib_rule_port_range_valid(>sport_range) || > + fib_rule_port_range_valid(>dport_range)) > + net->ipv6.fib6_rules_require_fldissect = true; > + > net->ipv6.fib6_has_custom_rules = true; > err = 0; > errout: > diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c > index 4fa4f1b..b0778d3 100644 > --- a/net/ipv6/icmp.c > +++ b/net/ipv6/icmp.c > @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 > code, __u32 info, > fl6.fl6_icmp_type = type; > fl6.fl6_icmp_code = code; > fl6.flowi6_uid = sock_net_uid(net, NULL); > - fl6.mp_hash = rt6_multipath_hash(, skb); > + fl6.mp_hash = rt6_multipath_hash(, skb, NULL); > security_skb_classify_flow(skb, flowi6_to_flowi()); > > sk = icmpv6_xmit_lock(net); > diff --git a/net/ipv6/route.c b/net/ipv6/route.c > index aa709b6..778212b 100644 > --- a/net/ipv6/route.c > +++ b/net/ipv6/route.c > @@ -460,7 +460,7 @@ static struct rt6_info *rt6_multipath_select(struct > rt6_info *match, >* case it will always be non-zero. Otherwise now is the time to do it. 
>*/ > if (!fl6->mp_hash) > - fl6->mp_hash = rt6_multipath_hash(fl6, NULL); > + fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); > > if (fl6->mp_hash <= atomic_read(>rt6i_nh_upper_bound)) > return match; > @@ -1786,10 +1786,12 @@ struct dst_entry *ip6_route_input_lookup(struct net > *net, > EXPORT_SYMBOL_GPL(ip6_route_input_lookup); > > static void ip6_multipath_l3_keys(const struct sk_buff *skb, > - struct flow_keys *keys) > + struct flow_keys *keys, > + struct flow_keys *flkeys) > { > const struct ipv6hdr *outer_iph = ipv6_hdr(skb); > const struct ipv6hdr *key_iph = outer_iph; > + struct flow_keys *_flkeys = flkeys; > const struct ipv6hdr *inner_iph; > const struct icmp6hdr *icmph; > struct ipv6hdr _inner_iph; > @@ -1811,22 +1813,31 @@ static void ip6_multipath_l3_keys(const struct > sk_buff *skb, > goto out; > > key_iph = inner_iph; > + _flkeys = NULL; > out: > memset(keys, 0, sizeof(*keys)); > keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; > - keys->addrs.v6addrs.src = key_iph->saddr; > - keys->addrs.v6addrs.dst = key_iph->daddr; > - keys->tags.flow_label = ip6_flowinfo(key_iph); > - keys->basic.ip_proto
Re: [PATCH net-next 4/5] ipv4: route: dissect flow in input path if fib rules need it
On 25/02/18 07:44, Roopa Prabhu wrote: > From: Roopa Prabhu> > Dissect flow in fwd path if fib rules require it. Controlled by > a flag to avoid penatly for the common case. Flag is set when fib > rules with sport, dport and proto match that require flow dissect > are installed. Also passes the dissected hash keys to the multipath > hash function when applicable to avoid dissecting the flow again. > icmp packets will continue to use inner header for hash > calculations (Thanks to Nikolay Aleksandrov for some review here). > > Signed-off-by: Roopa Prabhu > --- > include/net/ip_fib.h | 2 +- > include/net/netns/ipv4.h | 1 + > net/ipv4/fib_rules.c | 6 ++ > net/ipv4/fib_semantics.c | 2 +- > net/ipv4/route.c | 52 > +++- > 5 files changed, 47 insertions(+), 16 deletions(-) > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index f805243..5ada772 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -371,7 +371,7 @@ int fib_sync_up(struct net_device *dev, unsigned int > nh_flags); > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, > -const struct sk_buff *skb); > +const struct sk_buff *skb, struct flow_keys *flkeys); > #endif > void fib_select_multipath(struct fib_result *res, int hash); > void fib_select_path(struct net *net, struct fib_result *res, > diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h > index 44668c2..87b8fdc 100644 > --- a/include/net/netns/ipv4.h > +++ b/include/net/netns/ipv4.h > @@ -52,6 +52,7 @@ struct netns_ipv4 { > #ifdef CONFIG_IP_MULTIPLE_TABLES > struct fib_rules_ops*rules_ops; > boolfib_has_custom_rules; > + boolfib_rules_require_fldissect; > struct fib_table __rcu *fib_main; > struct fib_table __rcu *fib_default; > #endif > diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c > index 9d55c90..83aa786 100644 > --- a/net/ipv4/fib_rules.c > +++ b/net/ipv4/fib_rules.c > @@ -253,6 +253,11 @@ static int fib4_rule_configure(struct fib_rule 
*rule, > struct sk_buff *skb, > } > #endif > > + if (rule->ip_proto || > + fib_rule_port_range_valid(>sport_range) || > + fib_rule_port_range_valid(>dport_range)) > + net->ipv4.fib_rules_require_fldissect = true; > + > rule4->src_len = frh->src_len; > rule4->srcmask = inet_make_mask(rule4->src_len); > rule4->dst_len = frh->dst_len; > @@ -398,6 +403,7 @@ int __net_init fib4_rules_init(struct net *net) > goto fail; > net->ipv4.rules_ops = ops; > net->ipv4.fib_has_custom_rules = false; > + net->ipv4.fib_rules_require_fldissect = false; > return 0; > > fail: > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index cd46d76..b0c398a 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result > *res, > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > if (res->fi->fib_nhs > 1) { > - int h = fib_multipath_hash(res->fi, fl4, skb); > + int h = fib_multipath_hash(res->fi, fl4, skb, NULL); > > fib_select_multipath(res, h); > } > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index 26eefa2..72dd6c6 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1783,7 +1783,7 @@ static void ip_multipath_l3_keys(const struct sk_buff > *skb, > > /* if skb is set it will be used and fl4 can be NULL */ > int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, > -const struct sk_buff *skb) > +const struct sk_buff *skb, struct flow_keys *flkeys) > { > struct net *net = fi->fib_net; > struct flow_keys hash_keys; > @@ -1810,14 +1810,23 @@ int fib_multipath_hash(const struct fib_info *fi, > const struct flowi4 *fl4, > if (skb->l4_hash) > return skb_get_hash_raw(skb) >> 1; > memset(_keys, 0, sizeof(hash_keys)); > - skb_flow_dissect_flow_keys(skb, , flag); > > - hash_keys.control.addr_type = > FLOW_DISSECTOR_KEY_IPV4_ADDRS; > - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; > - hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; > - hash_keys.ports.src = 
keys.ports.src; > - hash_keys.ports.dst = keys.ports.dst; > - hash_keys.basic.ip_proto = keys.basic.ip_proto; > + if (flkeys) { > + hash_keys.control.addr_type = > FLOW_DISSECTOR_KEY_IPV4_ADDRS; > + hash_keys.addrs.v4addrs.src = >
Re: [PATCH net-next 1/5] net: fib_rules: support for match on ip_proto, sport and dport
On 25/02/18 07:44, Roopa Prabhu wrote: > From: Roopa Prabhu> > uapi for ip_proto, sport and dport range match > in fib rules. > > Signed-off-by: Roopa Prabhu > --- > include/net/fib_rules.h| 31 +- > include/uapi/linux/fib_rules.h | 8 > net/core/fib_rules.c | 94 > +- > 3 files changed, 130 insertions(+), 3 deletions(-) > You should probably update validate_rulemsg() as well, these aren't added in the per-proto policies and nothing validates if the attribute data is actually there. Maybe I'm missing something obvious, but it looks like many other FRA_ attributes don't have such checks. > diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h > index b3d2162..6d99202 100644 > --- a/include/net/fib_rules.h > +++ b/include/net/fib_rules.h > @@ -11,6 +11,11 @@ > #include > #include > > +struct fib_port_range { > + __u16 start; > + __u16 end; > +}; > + > struct fib_kuid_range { > kuid_t start; > kuid_t end; > @@ -27,7 +32,7 @@ struct fib_rule { > u8 action; > u8 l3mdev; > u8 proto; > - /* 1 byte hole, try to use */ > + u8 ip_proto; > u32 target; > __be64 tun_id; > struct fib_rule __rcu *ctarget; > @@ -40,6 +45,8 @@ struct fib_rule { > chariifname[IFNAMSIZ]; > charoifname[IFNAMSIZ]; > struct fib_kuid_range uid_range; > + struct fib_port_range sport_range; > + struct fib_port_range dport_range; > struct rcu_head rcu; > }; > > @@ -144,6 +151,28 @@ static inline u32 frh_get_table(struct fib_rule_hdr > *frh, struct nlattr **nla) > return frh->table; > } > > +static inline bool fib_rule_port_inrange(struct fib_port_range *a, > + __be16 port) > +{ > + if (!a->start) > + return true; Can start be == 0 ? IIUC this check is unnecessary because when you're adding the new rule, you do a check for start > 0 so it shouldn't be possible to be 0. 
> + return ntohs(port) >= a->start && > + ntohs(port) <= a->end; > +} > + > +static inline bool fib_rule_port_range_valid(const struct fib_port_range *a) > +{ > + return a->start > 0 && a->end < 0xffff && > + a->start <= a->end; nit: alignment (also can be on a single line) > +} > + > +static inline bool fib_rule_port_range_compare(struct fib_port_range *a, > +struct fib_port_range *b) > +{ > + return a->start == b->start && > + a->end == b->end; nit: alignment (also can be on a single line) > +} > + > struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, >struct net *); > void fib_rules_unregister(struct fib_rules_ops *); > diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h > index 77d90ae..232df14 100644 > --- a/include/uapi/linux/fib_rules.h > +++ b/include/uapi/linux/fib_rules.h > @@ -35,6 +35,11 @@ struct fib_rule_uid_range { > __u32 end; > }; > > +struct fib_rule_port_range { > + __u16 start; > + __u16 end; > +}; > + > enum { > FRA_UNSPEC, > FRA_DST,/* destination address */ > @@ -59,6 +64,9 @@ enum { > FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ > FRA_UID_RANGE, /* UID range */ > FRA_PROTOCOL, /* Originator of the rule */ > + FRA_IP_PROTO, /* ip proto */ > + FRA_SPORT_RANGE, /* sport */ > + FRA_DPORT_RANGE, /* dport */ > __FRA_MAX > }; > > diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c > index a6aea80..5008235 100644 > --- a/net/core/fib_rules.c > +++ b/net/core/fib_rules.c > @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) > if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || > !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) > return false; > + if (fib_rule_port_range_valid(&rule->sport_range)) > + return false; > + if (fib_rule_port_range_valid(&rule->dport_range)) > + return false; > return true; > } > EXPORT_SYMBOL_GPL(fib_rule_matchall); > @@ -221,6 +225,12 @@ static int nla_put_uid_range(struct sk_buff *skb, struct > fib_kuid_range *range) > return nla_put(skb, 
FRA_UID_RANGE, sizeof(out), &out); > } > > +static int nla_put_port_range(struct sk_buff *skb, int attrtype, > + struct fib_port_range *range) > +{ > + return nla_put(skb, attrtype, sizeof(*range), range); > +} > + > static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, > struct flowi *fl, int flags, >
Re: [PATCH] DT: net: renesas,ravb: document R8A77980 bindings
On 02/25/2018 11:48 AM, Sergei Shtylyov wrote: Renesas R-Car V3H (R8A77980) SoC has the R-Car gen3 compatible EtherAVB >>> device, so document the SoC specific bindings.>> Signed-off-by: > Sergei Shtylyov>> --->>> The patch > is against DaveM's 'net-next.git' repo but I wouldn't mind if it's>>> applied > to 'net.git' instead. :-) David, I see this patch was marked as "not > applicable" in patchwork. Why, because>> it was posted during the merge > window?> > No because I thought the relevant architecture tree would take a > mere> DT update. Sorry for this crap -- I was testing my new mini-keyboard and something went wrong... MBR, Sergei
Re: [PATCH net-next] sh_eth: TSU_QTAG0/1 registers the same as TSU_QTAGM0/1
On 02/25/2018 04:14 PM, Geert Uytterhoeven wrote: >> The TSU_QTAG0/1 registers found in the Gigabit Ether controllers actually >> have the same long name as the TSU_QTAGM0/1 registers in the early Ether >> controllers: Qtag Addition/Deletion Set Register (Port 0/1 to 1/0); thus >> there's no need to make a difference in sh_eth_tsu_init() between those >> controllers. Unfortunately, we can't just remove TSU_QTAG0/1 from the >> register *enum* because that would break the ethtool register dump... >> >> Fixes: b0ca2a21f769 ("sh_eth: Add support of SH7763 to sh_eth") >> Signed-off-by: Sergei Shtylyov> > Thanks for your patch! > >> --- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c >> +++ net-next/drivers/net/ethernet/renesas/sh_eth.c > >> @@ -2097,8 +2097,6 @@ static size_t __sh_eth_get_regs(struct n >> add_tsu_reg(TSU_FWSL0); >> add_tsu_reg(TSU_FWSL1); >> add_tsu_reg(TSU_FWSLC); >> - add_tsu_reg(TSU_QTAG0); >> - add_tsu_reg(TSU_QTAG1); > > Shouldn't you keep the above for ethtool register dump? Why dump the same registers twice? These are no longer marked as valid in the dump buffer and a dump user's only source of info about the valid registers is the bitmap at the start of the buffer... Note that the dump is only done for the registers actually used by the driver, and these 2 regs are no longer used anywhere... >> add_tsu_reg(TSU_QTAGM0); >> add_tsu_reg(TSU_QTAGM1); >> add_tsu_reg(TSU_FWSR); MBR, Sergei
Re: [PATCH net-next] sh_eth: TSU_QTAG0/1 registers the same as TSU_QTAGM0/1
Hi Sergei, On Sat, Feb 24, 2018 at 6:28 PM, Sergei Shtylyovwrote: > The TSU_QTAG0/1 registers found in the Gigabit Ether controllers actually > have the same long name as the TSU_QTAGM0/1 registers in the early Ether > controllers: Qtag Addition/Deletion Set Register (Port 0/1 to 1/0); thus > there's no need to make a difference in sh_eth_tsu_init() between those > controllers. Unfortunately, we can't just remove TSU_QTAG0/1 from the > register *enum* because that would break the ethtool register dump... > > Fixes: b0ca2a21f769 ("sh_eth: Add support of SH7763 to sh_eth") > Signed-off-by: Sergei Shtylyov Thanks for your patch! > --- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c > +++ net-next/drivers/net/ethernet/renesas/sh_eth.c > @@ -2097,8 +2097,6 @@ static size_t __sh_eth_get_regs(struct n > add_tsu_reg(TSU_FWSL0); > add_tsu_reg(TSU_FWSL1); > add_tsu_reg(TSU_FWSLC); > - add_tsu_reg(TSU_QTAG0); > - add_tsu_reg(TSU_QTAG1); Shouldn't you keep the above for ethtool register dump? > add_tsu_reg(TSU_QTAGM0); > add_tsu_reg(TSU_QTAGM1); > add_tsu_reg(TSU_FWSR); > --- net-next.orig/drivers/net/ethernet/renesas/sh_eth.h > +++ net-next/drivers/net/ethernet/renesas/sh_eth.h > @@ -118,8 +118,8 @@ enum { > TSU_FWSL0, > TSU_FWSL1, > TSU_FWSLC, > - TSU_QTAG0, > - TSU_QTAG1, > + TSU_QTAG0, /* Same as TSU_QTAGM0 */ > + TSU_QTAG1, /* Same as TSU_QTAGM1 */ > TSU_QTAGM0, > TSU_QTAGM1, > TSU_FWSR, Gr{oetje,eeting}s, Geert -- Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org In personal conversations with technical people, I call myself a hacker. But when I'm talking to journalists I just say "programmer" or something like that. -- Linus Torvalds
Re: Potential issue with f5e64032a799 "net: phy: fix resume handling"
Am 03.02.2018 um 21:17 schrieb Andrew Lunn: > On Sat, Feb 03, 2018 at 05:41:54PM +0100, Heiner Kallweit wrote: >> This commit forces callers of phy_resume() and phy_suspend() to hold >> mutex phydev->lock. This was done for calls to phy_resume() and >> phy_suspend() in phylib, however there are more callers in network >> drivers. I'd assume that these other calls issue a warning now >> because of the lock not being held. >> So is there something I miss or would this have to be fixed? > > Hi Heiner > > This is a good point. > > Yes, it looks like some fixes are needed. But what exactly? > The issue with phy_suspend/phy_resume and the changed locking behavior is still open AFAICS. There was a proposed fix https://www.mail-archive.com/netdev@vger.kernel.org/msg215455.html and then the discussion stopped. I think we need the fix before 4.16 leaves the rc phase. Heiner > The phy state machine will suspend and resume the phy is you call > phy_stop() and phy_start() in the MAC suspend and resume functions. > > A few examples: > > tc35815_suspend(), ravb_suspend() via ravb_close(), sh_eth_suspend() > via sh_eth_close(), fec_suspend(), mpc52xx_fec_of_suspend() via > mpc52xx_fec_close(), ucc_geth_suspend(), etc... > > So i suspect those drivers which call phy_suspend()/phy_resume() > should really be modified to call phy_stop()/phy_start(). > > hns_nic_config_phy_loopback() is just funky, and probably needs the > help of the hns guys to fix. > > dsa_slave_suspend() already does a phy_stop(), so the phy_suspend() > can be removed. > > The comments in lpc_eth_open() suggest the phy_resume() is needed, so > locks should be added. socfpga_dwmac_resume() seems to be the same. > > Andrew >
[PATCH net-next 4/5] net: mdio-gpio: Merge platform data into state
There is no instantiation without DT data, we can now move to a single state container and merge the DT property retrieval into mdio_gpio_bus_init(). We decommission the phy_mask, phy_ignore_ta_mask and irqs array and the reset() callback that were all just sitting unused and taking up space. If bitbanged GPIOs need to set up reset() callbacks these should be done in the device tree using proper bindings. If bitbanged GPIOs need to handle IRQs, these should be done in the device tree using the proper bindings. Signed-off-by: Linus Walleij --- drivers/net/phy/mdio-gpio.c | 130 1 file changed, 48 insertions(+), 82 deletions(-) diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 96c953d086c6..9146077b5278 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -30,61 +30,11 @@ #include #include -struct mdio_gpio_platform_data { - /* GPIO numbers for bus pins */ - unsigned int mdc; - unsigned int mdio; - unsigned int mdo; - - bool mdc_active_low; - bool mdio_active_low; - bool mdo_active_low; - - u32 phy_mask; - u32 phy_ignore_ta_mask; - int irqs[PHY_MAX_ADDR]; - /* reset callback */ - int (*reset)(struct mii_bus *bus); -}; - struct mdio_gpio_info { struct mdiobb_ctrl ctrl; struct gpio_desc *mdc, *mdio, *mdo; }; -static void *mdio_gpio_of_get_data(struct device *dev) -{ - struct device_node *np = dev->of_node; - struct mdio_gpio_platform_data *pdata; - enum of_gpio_flags flags; - int ret; - - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return NULL; - - ret = of_get_gpio_flags(np, 0, &flags); - if (ret < 0) - return NULL; - - pdata->mdc = ret; - pdata->mdc_active_low = flags & OF_GPIO_ACTIVE_LOW; - - ret = of_get_gpio_flags(np, 1, &flags); - if (ret < 0) - return NULL; - pdata->mdio = ret; - pdata->mdio_active_low = flags & OF_GPIO_ACTIVE_LOW; - - ret = of_get_gpio_flags(np, 2, &flags); - if (ret > 0) { - pdata->mdo = ret; - pdata->mdo_active_low = flags & OF_GPIO_ACTIVE_LOW; - } - - return pdata; -} - static void 
mdio_dir(struct mdiobb_ctrl *ctrl, int dir) { struct mdio_gpio_info *bitbang = @@ -142,31 +92,60 @@ static const struct mdiobb_ops mdio_gpio_ops = { }; static struct mii_bus *mdio_gpio_bus_init(struct device *dev, - struct mdio_gpio_info *bitbang, - struct mdio_gpio_platform_data *pdata, - int bus_id) + struct mdio_gpio_info *bitbang) { - struct mii_bus *new_bus; - int i; - int mdc, mdio, mdo; + unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; unsigned long mdio_flags = GPIOF_DIR_IN; - unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; + struct device_node *np = dev->of_node; + enum of_gpio_flags flags; + struct mii_bus *new_bus; + bool mdio_active_low; + bool mdc_active_low; + bool mdo_active_low; + unsigned int mdio; + unsigned int mdc; + unsigned int mdo; + int bus_id; + int ret, i; + + ret = of_get_gpio_flags(np, 0, &flags); + if (ret < 0) + return NULL; + + mdc = ret; + mdc_active_low = flags & OF_GPIO_ACTIVE_LOW; + + ret = of_get_gpio_flags(np, 1, &flags); + if (ret < 0) + return NULL; + mdio = ret; + mdio_active_low = flags & OF_GPIO_ACTIVE_LOW; + + ret = of_get_gpio_flags(np, 2, &flags); + if (ret > 0) { + mdo = ret; + mdo_active_low = flags & OF_GPIO_ACTIVE_LOW; + } else { + mdo = 0; + } + + bus_id = of_alias_get_id(np, "mdio-gpio"); + if (bus_id < 0) { + dev_warn(dev, "failed to get alias id\n"); + bus_id = 0; } bitbang->ctrl.ops = &mdio_gpio_ops; - bitbang->ctrl.reset = pdata->reset; - mdc = pdata->mdc; bitbang->mdc = gpio_to_desc(mdc); - if (pdata->mdc_active_low) + if (mdc_active_low) mdc_flags = GPIOF_OUT_INIT_HIGH | GPIOF_ACTIVE_LOW; - mdio = pdata->mdio; bitbang->mdio = gpio_to_desc(mdio); - if (pdata->mdio_active_low) + if (mdio_active_low) mdio_flags |= GPIOF_ACTIVE_LOW; - mdo = pdata->mdo; if (mdo) { bitbang->mdo = gpio_to_desc(mdo); - if (pdata->mdo_active_low) + if (mdo_active_low) mdo_flags = GPIOF_OUT_INIT_LOW | GPIOF_ACTIVE_LOW; } @@ -175,10 +154,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct
[PATCH net-next 3/5] net: mdio-gpio: Remove non-DT probe path
This driver can now only be created using the device tree. Remove the platform data probe path and require OF_MDIO in Kconfig. Signed-off-by: Linus Walleij--- drivers/net/phy/Kconfig | 2 +- drivers/net/phy/mdio-gpio.c | 21 ++--- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index bdfbabb86ee0..27efc5d6fbe2 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -92,7 +92,7 @@ config MDIO_CAVIUM config MDIO_GPIO tristate "GPIO lib-based bitbanged MDIO buses" - depends on MDIO_BITBANG && GPIOLIB + depends on MDIO_BITBANG && GPIOLIB && OF_MDIO ---help--- Supports GPIO lib-based MDIO busses. diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index d95bb45eb67b..96c953d086c6 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -239,16 +239,11 @@ static int mdio_gpio_probe(struct platform_device *pdev) if (!bitbang) return -ENOMEM; - if (pdev->dev.of_node) { - pdata = mdio_gpio_of_get_data(dev); - bus_id = of_alias_get_id(dev->of_node, "mdio-gpio"); - if (bus_id < 0) { - dev_warn(dev, "failed to get alias id\n"); - bus_id = 0; - } - } else { - pdata = dev_get_platdata(dev); - bus_id = pdev->id; + pdata = mdio_gpio_of_get_data(dev); + bus_id = of_alias_get_id(dev->of_node, "mdio-gpio"); + if (bus_id < 0) { + dev_warn(dev, "failed to get alias id\n"); + bus_id = 0; } if (!pdata) @@ -258,11 +253,7 @@ static int mdio_gpio_probe(struct platform_device *pdev) if (!new_bus) return -ENODEV; - if (dev->of_node) - ret = of_mdiobus_register(new_bus, dev->of_node); - else - ret = mdiobus_register(new_bus); - + ret = of_mdiobus_register(new_bus, dev->of_node); if (ret) mdio_gpio_bus_deinit(dev); -- 2.14.3
[PATCH net-next 2/5] net: mdio-gpio: Allocate state in probe()
Allocate the state container for the driver, struct mdio_gpio_info inside the probe() function instead of in the mdio_gpio_bus_init() function. Create the local struct device *dev variable in probe() and pass that around instead of constantly dereferencing the struct platform_data. Signed-off-by: Linus Walleij --- drivers/net/phy/mdio-gpio.c | 34 ++ 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 6d669f24c0e6..d95bb45eb67b 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -52,14 +52,14 @@ struct mdio_gpio_info { struct gpio_desc *mdc, *mdio, *mdo; }; -static void *mdio_gpio_of_get_data(struct platform_device *pdev) +static void *mdio_gpio_of_get_data(struct device *dev) { - struct device_node *np = pdev->dev.of_node; + struct device_node *np = dev->of_node; struct mdio_gpio_platform_data *pdata; enum of_gpio_flags flags; int ret; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return NULL; @@ -142,21 +142,17 @@ static const struct mdiobb_ops mdio_gpio_ops = { }; static struct mii_bus *mdio_gpio_bus_init(struct device *dev, + struct mdio_gpio_info *bitbang, struct mdio_gpio_platform_data *pdata, int bus_id) { struct mii_bus *new_bus; - struct mdio_gpio_info *bitbang; int i; int mdc, mdio, mdo; unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; unsigned long mdio_flags = GPIOF_DIR_IN; unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; - bitbang = devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL); - if (!bitbang) - goto out; - bitbang->ctrl.ops = &mdio_gpio_ops; bitbang->ctrl.reset = pdata->reset; mdc = pdata->mdc; @@ -234,35 +230,41 @@ static void mdio_gpio_bus_destroy(struct device *dev) static int mdio_gpio_probe(struct platform_device *pdev) { struct mdio_gpio_platform_data *pdata; + struct device *dev = &pdev->dev; + struct mdio_gpio_info *bitbang; struct mii_bus *new_bus; int ret, bus_id; + bitbang = 
devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL); + if (!bitbang) + return -ENOMEM; + if (pdev->dev.of_node) { - pdata = mdio_gpio_of_get_data(pdev); - bus_id = of_alias_get_id(pdev->dev.of_node, "mdio-gpio"); + pdata = mdio_gpio_of_get_data(dev); + bus_id = of_alias_get_id(dev->of_node, "mdio-gpio"); if (bus_id < 0) { - dev_warn(&pdev->dev, "failed to get alias id\n"); + dev_warn(dev, "failed to get alias id\n"); bus_id = 0; } } else { - pdata = dev_get_platdata(&pdev->dev); + pdata = dev_get_platdata(dev); bus_id = pdev->id; } if (!pdata) return -ENODEV; - new_bus = mdio_gpio_bus_init(&pdev->dev, pdata, bus_id); + new_bus = mdio_gpio_bus_init(dev, bitbang, pdata, bus_id); if (!new_bus) return -ENODEV; - if (pdev->dev.of_node) - ret = of_mdiobus_register(new_bus, pdev->dev.of_node); + if (dev->of_node) - ret = of_mdiobus_register(new_bus, dev->of_node); else ret = mdiobus_register(new_bus); if (ret) - mdio_gpio_bus_deinit(&pdev->dev); + mdio_gpio_bus_deinit(dev); return ret; } -- 2.14.3
[PATCH net-next 5/5] net: mdio-gpio: Move to gpiod API
Move the bitbanged GPIO based MDIO driver over to using the gpiolib GPIO descriptors and transfer the line inversion and optional MDO handling semantics to gpiolib. The driver has been parsing the device tree to handle GPIO semantics on its own, but this is completely unnecessary as the gpiolib can handle all inversion and optional line semantics. This cuts down the code a lot and makes the driver simpler. Switch mdio_gpio_bus_init() to return an error pointer and handle this in probe() so we can back out of e.g. -EPROBE_DEFER properly if we need to. After this the GPIO MDIO driver only uses GPIO descriptors and is completely decoupled from the old GPIO API. Signed-off-by: Linus Walleij --- drivers/net/phy/mdio-gpio.c | 81 + 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 9146077b5278..5740f16a0f30 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -25,9 +25,7 @@ #include #include #include -#include - -#include +#include #include struct mdio_gpio_info { @@ -94,41 +92,23 @@ static const struct mdiobb_ops mdio_gpio_ops = { }; static struct mii_bus *mdio_gpio_bus_init(struct device *dev, struct mdio_gpio_info *bitbang) { - unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH; - unsigned long mdc_flags = GPIOF_OUT_INIT_LOW; - unsigned long mdio_flags = GPIOF_DIR_IN; struct device_node *np = dev->of_node; - enum of_gpio_flags flags; struct mii_bus *new_bus; - bool mdio_active_low; - bool mdc_active_low; - bool mdo_active_low; - unsigned int mdio; - unsigned int mdc; - unsigned int mdo; int bus_id; int ret, i; - ret = of_get_gpio_flags(np, 0, &flags); - if (ret < 0) - return NULL; - - mdc = ret; - mdc_active_low = flags & OF_GPIO_ACTIVE_LOW; - - ret = of_get_gpio_flags(np, 1, &flags); - if (ret < 0) - return NULL; - mdio = ret; - mdio_active_low = flags & OF_GPIO_ACTIVE_LOW; - - ret = of_get_gpio_flags(np, 2, &flags); - if (ret > 0) { - mdo = ret; - mdo_active_low = flags & OF_GPIO_ACTIVE_LOW; - 
} else { - mdo = 0; - } + bitbang->mdc = + devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW); + if (IS_ERR(bitbang->mdc)) + return ERR_CAST(bitbang->mdc); + bitbang->mdio = + devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN); + if (IS_ERR(bitbang->mdio)) + return ERR_CAST(bitbang->mdio); + bitbang->mdo = + devm_gpiod_get_index_optional(dev, NULL, 2, GPIOD_OUT_HIGH); + if (IS_ERR(bitbang->mdo)) + return ERR_CAST(bitbang->mdo); bus_id = of_alias_get_id(np, "mdio-gpio"); if (bus_id < 0) { @@ -137,27 +117,21 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, } bitbang->ctrl.ops = &mdio_gpio_ops; - bitbang->mdc = gpio_to_desc(mdc); - if (mdc_active_low) - mdc_flags = GPIOF_OUT_INIT_HIGH | GPIOF_ACTIVE_LOW; - bitbang->mdio = gpio_to_desc(mdio); - if (mdio_active_low) - mdio_flags |= GPIOF_ACTIVE_LOW; - if (mdo) { - bitbang->mdo = gpio_to_desc(mdo); - if (mdo_active_low) - mdo_flags = GPIOF_OUT_INIT_LOW | GPIOF_ACTIVE_LOW; - } new_bus = alloc_mdio_bitbang(&bitbang->ctrl); - if (!new_bus) + if (!new_bus) { + ret = -ENOMEM; goto out; + } new_bus->name = "GPIO Bitbanged MDIO", new_bus->parent = dev; - if (new_bus->phy_mask == ~0) + if (new_bus->phy_mask == ~0) { + dev_err(dev, "no PHY in mask\n"); + ret = -ENODEV; goto out_free_bus; + } for (i = 0; i < PHY_MAX_ADDR; i++) if (!new_bus->irq[i]) @@ -168,15 +142,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, else strncpy(new_bus->id, "gpio", MII_BUS_ID_SIZE); - if (devm_gpio_request_one(dev, mdc, mdc_flags, "mdc")) - goto out_free_bus; - - if (devm_gpio_request_one(dev, mdio, mdio_flags, "mdio")) - goto out_free_bus; - - if (mdo && devm_gpio_request_one(dev, mdo, mdo_flags, "mdo")) - goto out_free_bus; - dev_set_drvdata(dev, new_bus); return new_bus; @@ -184,7 +149,7 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, out_free_bus: free_mdio_bitbang(new_bus); out: - return NULL; + return ERR_PTR(ret); } static void mdio_gpio_bus_deinit(struct device *dev) @@ -216,8 +181,8 @@ static int 
mdio_gpio_probe(struct platform_device *pdev) return
[PATCH net-next 1/5] net: mdio-gpio: Localize platform data
It is late on the day for platforms using platform data to pass information to drivers. As of today, the only thing in the kernel including the file is the MDIO GPIO driver itself. Essentially it is exposing a kernel-internal interface unused by any in-kernel code. Let's decomission this and make the MDIO GPIO driver more self-contained by starting to move this struct into the driver. Signed-off-by: Linus Walleij--- MAINTAINERS | 1 - drivers/net/phy/mdio-gpio.c | 19 ++- include/linux/platform_data/mdio-gpio.h | 33 - 3 files changed, 18 insertions(+), 35 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h diff --git a/MAINTAINERS b/MAINTAINERS index 3bdc260e36b7..e5a1a06c09e8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5333,7 +5333,6 @@ F:include/linux/*mdio*.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h -F: include/linux/platform_data/mdio-gpio.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 4333c6e14742..6d669f24c0e6 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -24,12 +24,29 @@ #include #include #include +#include #include -#include #include #include +struct mdio_gpio_platform_data { + /* GPIO numbers for bus pins */ + unsigned int mdc; + unsigned int mdio; + unsigned int mdo; + + bool mdc_active_low; + bool mdio_active_low; + bool mdo_active_low; + + u32 phy_mask; + u32 phy_ignore_ta_mask; + int irqs[PHY_MAX_ADDR]; + /* reset callback */ + int (*reset)(struct mii_bus *bus); +}; + struct mdio_gpio_info { struct mdiobb_ctrl ctrl; struct gpio_desc *mdc, *mdio, *mdo; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index 11f00cdabe3d.. 
--- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * MDIO-GPIO bus platform data structures - * - * Copyright (C) 2008, Paulius Zaleckas - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ - -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#include - -struct mdio_gpio_platform_data { - /* GPIO numbers for bus pins */ - unsigned int mdc; - unsigned int mdio; - unsigned int mdo; - - bool mdc_active_low; - bool mdio_active_low; - bool mdo_active_low; - - u32 phy_mask; - u32 phy_ignore_ta_mask; - int irqs[PHY_MAX_ADDR]; - /* reset callback */ - int (*reset)(struct mii_bus *bus); -}; - -#endif /* __LINUX_MDIO_GPIO_H */ -- 2.14.3
[PATCH net-next 0/5] Modernize bitbanged GPIO MDIO
This kills off the platform data support from the bitbanged GPIO-based MDIO driver and moves it over to using GPIO descriptors exclusively. We are certainly not going to merge any more platforms into the kernel using platform data, and nothing is using it at the moment. The only concern would be out-of-tree platforms, and those are not the concern of the kernel community. They need to move to use device tree (or ACPI etc) like everyone else. This was tested on the bit-banged GPIO MDIO on the D-Link DNS-313 and works fine for me. Linus Walleij (5): net: mdio-gpio: Localize platform data net: mdio-gpio: Allocate state in probe() net: mdio-gpio: Remove non-DT probe path net: mdio-gpio: Merge platform data into state net: mdio-gpio: Move to gpiod API MAINTAINERS | 1 - drivers/net/phy/Kconfig | 2 +- drivers/net/phy/mdio-gpio.c | 151 ++-- include/linux/platform_data/mdio-gpio.h | 33 --- 4 files changed, 47 insertions(+), 140 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h -- 2.14.3
Re: [PATCH iproute2-next v3 2/8] iplink: Correctly report error when network device isn't found
David Ahern wrote: > On 2/22/18 6:02 AM, Serhey Popovych wrote: >> @@ -650,6 +658,9 @@ int iplink_parse(int argc, char **argv, struct >> iplink_req *req, >> bool drv = strcmp(*argv, "xdpdrv") == 0; >> bool offload = strcmp(*argv, "xdpoffload") == 0; >> >> +if (offload) >> +has_dev(*dev, dev_index); >> + > > I think this is actually the wrong direction. Seems to me argv should be > passed to xdp_parse rather than the generic, drv, offload bools. That > function can then do the check on which option it is and has the knowledge > about whether a device is needed or not. Okay, I probably will prepare another change instead that accounts for your suggestions. Will add it to v4 later. > > >> NEXT_ARG(); >> if (xdp_parse(&argc, &argv, req, dev_index, >>generic, drv, offload))