Re: [PATCH net] sch_netem: fix skb leak in netem_enqueue()

2018-03-02 Thread Eric Dumazet
On Fri, 2018-03-02 at 10:44 -0800, Stephen Hemminger wrote:
> On Fri,  2 Mar 2018 21:16:48 +0300
> 
> Since this is a generic problem, why is it not fixed in qdisc_drop instead?

AFAIK only netem and tbf might segment GSO packets so far.

I am not sure we want to add code to qdisc_drop(), which is inline code
and is used under stress on normal qdiscs.




[PATCH net-next] liquidio: Added ndo_get_phys_port_id support

2018-03-02 Thread Felix Manlunas
From: Intiyaz Basha 

Added support for the ndo_get_phys_port_id() callback to provide a
port-specific unique id to the netdev layer.

The port id needs to be unique across different liquidio devices in the
system, so the MAC address is used as the port_id.

Usage: cat /sys/class/net/<iface>/phys_port_id

Signed-off-by: Intiyaz Basha 
Acked-by: Derek Chickles 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c   | 13 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c   |  1 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c|  1 +
 drivers/net/ethernet/cavium/liquidio/octeon_network.h |  3 +++
 4 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c 
b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 8b1ee83..8bb4cfb 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -1081,3 +1081,16 @@ int octeon_setup_interrupt(struct octeon_device *oct, 
u32 num_ioqs)
}
return 0;
 }
+
+int liquidio_get_phys_port_id(struct net_device *netdev,
+ struct netdev_phys_item_id *ppid)
+{
+   struct lio *lio = GET_LIO(netdev);
+   u8 addr[ETH_ALEN];
+
+   u64_to_ether_addr(be64_to_cpu(lio->linfo.hw_addr), addr);
+   ppid->id_len = ETH_ALEN;
+   memcpy(ppid->id, addr, ppid->id_len);
+
+   return 0;
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index a5eecd8..e376b9d 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3424,6 +3424,7 @@ static const struct net_device_ops lionetdevops = {
.ndo_set_vf_vlan= liquidio_set_vf_vlan,
.ndo_get_vf_config  = liquidio_get_vf_config,
.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
+   .ndo_get_phys_port_id   = liquidio_get_phys_port_id,
 };
 
 /** \brief Entry point for the liquidio module
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index fd70a48..dbff977 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -2244,6 +2244,7 @@ static const struct net_device_ops lionetdevops = {
.ndo_set_features   = liquidio_set_features,
.ndo_udp_tunnel_add = liquidio_add_vxlan_port,
.ndo_udp_tunnel_del = liquidio_del_vxlan_port,
+   .ndo_get_phys_port_id   = liquidio_get_phys_port_id,
 };
 
 static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf)
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index f2d1a07..ea7536f 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -184,6 +184,9 @@ int octeon_setup_interrupt(struct octeon_device *oct, u32 
num_ioqs);
  */
 void liquidio_set_ethtool_ops(struct net_device *netdev);
 
+int liquidio_get_phys_port_id(struct net_device *netdev,
+ struct netdev_phys_item_id *ppid);
+
 #define SKB_ADJ_MASK  0x3F
 #define SKB_ADJ   (SKB_ADJ_MASK + 1)
 


Re: pull-request: bpf-next 2018-03-03

2018-03-02 Thread David Miller
From: Daniel Borkmann 
Date: Sat,  3 Mar 2018 02:42:01 +0100

> The following pull-request contains BPF updates for your *net-next* tree.
> 
> The main changes are:
> 
> 1) Extend bpftool to build up CFG information of eBPF programs and add an
>option to dump this in DOT format such that this can later be used with
>DOT graphic tools (xdot, graphviz, etc) to visualize it. Part of the
>analysis performed is sub-program detection and basic-block partitioning,
>from Jiong.
> 
> 2) Multiple enhancements for bpftool's batch mode, more specifically the
>parser now understands comments (#), continuation lines (\), and arguments
>enclosed between quotes. Also, allow to read from stdin via '-' as input
>file, all from Quentin.
> 
> 3) Improve BPF kselftests by i) unifying the rlimit handling into a helper
>that is then used by all tests, and ii) add support for testing tail calls
>to test_verifier plus add tests covering all corner cases. The latter is
>especially useful for testing JITs, from Daniel.
> 
> 4) Remove x64 JIT's bpf_flush_icache() since flush_icache_range() is a noop
>on x64, from Daniel.
> 
> 5) Fix one more occasion in BPF samples where we do not detach the BPF program
>from the cgroup after completion, from Prashant.
> 
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git

Pulled, thanks Daniel.


[PATCH net-next] liquidio: Corrected Rx bytes counting

2018-03-02 Thread Felix Manlunas
From: Intiyaz Basha 

Corrected stats mismatch between Host Tx and its peer Rx stats

Signed-off-by: Intiyaz Basha 
Acked-by: Derek Chickles 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c 
b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 90da33c..8bb4cfb 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -571,7 +571,8 @@ static int octeon_setup_droq(struct octeon_device *oct, int 
q_no, int num_descs,
 
napi_gro_receive(napi, skb);
 
-   droq->stats.rx_bytes_received += len;
+   droq->stats.rx_bytes_received += len -
+   rh->r_dh.len * BYTES_PER_DHLEN_UNIT;
droq->stats.rx_pkts_received++;
} else {
recv_buffer_free(skb);
-- 
1.8.3.1



[PATCH bpf] bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs

2018-03-02 Thread Daniel Borkmann
From: Daniel Axtens 

SCTP GSO skbs have a gso_size of GSO_BY_FRAGS, so any sort of
unconditional mangling of that will result in a nonsense value
and would corrupt the skb later on.

Therefore, i) add two helpers skb_increase_gso_size() and
skb_decrease_gso_size() that would throw a one time warning and
bail out for such skbs and ii) refuse and return early with an
error in those BPF helpers that are affected. We do need to bail
out as early as possible from there before any changes on the
skb have been performed.

Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Co-authored-by: Daniel Borkmann 
Signed-off-by: Daniel Axtens 
Cc: Marcelo Ricardo Leitner 
Acked-by: Alexei Starovoitov 
---
 [ Also squashed the two into one, so that this is better for
   backporting and has no other dependency. ]

 Documentation/networking/segmentation-offloads.txt | 11 +++-
 include/linux/skbuff.h | 22 
 net/core/filter.c  | 60 +++---
 3 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/Documentation/networking/segmentation-offloads.txt 
b/Documentation/networking/segmentation-offloads.txt
index d47480b..23a8dd9 100644
--- a/Documentation/networking/segmentation-offloads.txt
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -153,8 +153,15 @@ To signal this, gso_size is set to the special value 
GSO_BY_FRAGS.
 
 Therefore, any code in the core networking stack must be aware of the
 possibility that gso_size will be GSO_BY_FRAGS and handle that case
-appropriately. (For size checks, the skb_gso_validate_*_len family of
-helpers do this automatically.)
+appropriately.
+
+There are a couple of helpers to make this easier:
+
+ - For size checks, the skb_gso_validate_*_len family of helpers correctly
+   considers GSO_BY_FRAGS.
+
+ - For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size
+   will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs.
 
 This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits
 set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c1e66bd..8c67c33 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4038,6 +4038,12 @@ static inline bool skb_is_gso_v6(const struct sk_buff 
*skb)
return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
 }
 
+/* Note: Should be called only if skb_is_gso(skb) is true */
+static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
+{
+   return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
+}
+
 static inline void skb_gso_reset(struct sk_buff *skb)
 {
skb_shinfo(skb)->gso_size = 0;
@@ -4045,6 +4051,22 @@ static inline void skb_gso_reset(struct sk_buff *skb)
skb_shinfo(skb)->gso_type = 0;
 }
 
+static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
+u16 increment)
+{
+   if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+   return;
+   shinfo->gso_size += increment;
+}
+
+static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
+u16 decrement)
+{
+   if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+   return;
+   shinfo->gso_size -= decrement;
+}
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb);
 
 static inline bool skb_warn_if_lro(const struct sk_buff *skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c121ad..48aa7c7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2087,6 +2087,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
 
+   /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+   if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+   return -ENOTSUPP;
+
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -2096,19 +2100,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
return ret;
 
if (skb_is_gso(skb)) {
+   struct skb_shared_info *shinfo = skb_shinfo(skb);
+
/* SKB_GSO_TCPV4 needs to be changed into
 * SKB_GSO_TCPV6.
 */
-   if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
-   skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
-   skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
+   if (shinfo->gso_type & SKB_GSO_TCPV4) {
+   shinfo->gso_type &= ~SKB_GSO_TCPV4;
+   shinfo->gso_type |=  SKB_GSO_TCPV6;
}
 
/* Due to IPv6 header, MSS needs to be downgraded. */
-   skb_shinfo(skb)->gso_size -= len_diff;
+   

[PATCH] net/wireless: fix spaces and grammar copy/paste in vendor Kconfig help text

2018-03-02 Thread Randy Dunlap
From: Randy Dunlap <rdun...@infradead.org>

Lots of the wireless driver vendor Kconfig symbol help text says
"questions about  cards." (2 spaces between "about" and "cards")

Besides dropping one of those spaces, it also needs some other word
inserted there. Instead of putting each vendor's name there, I chose
to say "these" cards in all of the Kconfig help text.

Cc: Kalle Valo <kv...@codeaurora.org>
Signed-off-by: Randy Dunlap <rdun...@infradead.org>
---
 drivers/net/wireless/admtek/Kconfig|4 ++--
 drivers/net/wireless/ath/Kconfig   |4 ++--
 drivers/net/wireless/atmel/Kconfig |4 ++--
 drivers/net/wireless/broadcom/Kconfig  |4 ++--
 drivers/net/wireless/cisco/Kconfig |4 ++--
 drivers/net/wireless/intel/Kconfig |4 ++--
 drivers/net/wireless/intersil/Kconfig  |4 ++--
 drivers/net/wireless/marvell/Kconfig   |4 ++--
 drivers/net/wireless/mediatek/Kconfig  |4 ++--
 drivers/net/wireless/quantenna/Kconfig |4 ++--
 drivers/net/wireless/ralink/Kconfig|4 ++--
 drivers/net/wireless/realtek/Kconfig   |4 ++--
 drivers/net/wireless/rsi/Kconfig   |4 ++--
 drivers/net/wireless/st/Kconfig|4 ++--
 drivers/net/wireless/ti/Kconfig|4 ++--
 drivers/net/wireless/zydas/Kconfig |4 ++--
 16 files changed, 32 insertions(+), 32 deletions(-)

--- linux-next-20180302.orig/drivers/net/wireless/admtek/Kconfig
+++ linux-next-20180302/drivers/net/wireless/admtek/Kconfig
@@ -5,8 +5,8 @@ config WLAN_VENDOR_ADMTEK
  If you have a wireless card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about  cards. If you say Y, you will be asked for
+ kernel: saying N will just cause the configurator to skip all the
+ questions about these cards. If you say Y, you will be asked for
  your specific card in the following questions.
 
 if WLAN_VENDOR_ADMTEK
--- linux-next-20180302.orig/drivers/net/wireless/ath/Kconfig
+++ linux-next-20180302/drivers/net/wireless/ath/Kconfig
@@ -8,8 +8,8 @@ config WLAN_VENDOR_ATH
  If you have a wireless card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about  cards. If you say Y, you will be asked for
+ kernel: saying N will just cause the configurator to skip all the
+ questions about these cards. If you say Y, you will be asked for
  your specific card in the following questions.
 
  For more information and documentation on this module you can visit:
--- linux-next-20180302.orig/drivers/net/wireless/atmel/Kconfig
+++ linux-next-20180302/drivers/net/wireless/atmel/Kconfig
@@ -5,8 +5,8 @@ config WLAN_VENDOR_ATMEL
  If you have a wireless card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about  cards. If you say Y, you will be asked for
+ kernel: saying N will just cause the configurator to skip all the
+ questions about these cards. If you say Y, you will be asked for
  your specific card in the following questions.
 
 if WLAN_VENDOR_ATMEL
--- linux-next-20180302.orig/drivers/net/wireless/broadcom/Kconfig
+++ linux-next-20180302/drivers/net/wireless/broadcom/Kconfig
@@ -5,8 +5,8 @@ config WLAN_VENDOR_BROADCOM
  If you have a wireless card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about  cards. If you say Y, you will be asked for
+ kernel: saying N will just cause the configurator to skip all the
+ questions about these cards. If you say Y, you will be asked for
  your specific card in the following questions.
 
 if WLAN_VENDOR_BROADCOM
--- linux-next-20180302.orig/drivers/net/wireless/cisco/Kconfig
+++ linux-next-20180302/drivers/net/wireless/cisco/Kconfig
@@ -5,8 +5,8 @@ config WLAN_VENDOR_CISCO
  If you have a wireless card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about  cards. If you say Y, you will be asked for
+ kernel: saying N will just cause the configurator to skip all the
+ questions about these cards. If you say Y, you will be asked for
  your specific card in the following questions.
 
 if WLAN_VENDOR_CISCO
--- linux-next-20180302.orig/drivers/net/wireless/intel/Kconfig
+++ linux-nex

[PATCH net-next 1/1] net sched actions: corrected extack message

2018-03-02 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 net/sched/act_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 1f65d6a..a54fa7b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1083,7 +1083,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, 
struct list_head *actions,
 
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
 RTM_NEWACTION, 0, 0) <= 0) {
-   NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while 
deleting TC action");
+   NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while 
adding TC action");
kfree_skb(skb);
return -EINVAL;
}
-- 
2.7.4



pull-request: bpf-next 2018-03-03

2018-03-02 Thread Daniel Borkmann
Hi David,

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Extend bpftool to build up CFG information of eBPF programs and add an
   option to dump this in DOT format such that this can later be used with
   DOT graphic tools (xdot, graphviz, etc) to visualize it. Part of the
   analysis performed is sub-program detection and basic-block partitioning,
   from Jiong.

2) Multiple enhancements for bpftool's batch mode, more specifically the
   parser now understands comments (#), continuation lines (\), and arguments
   enclosed between quotes. Also, allow to read from stdin via '-' as input
   file, all from Quentin.

3) Improve BPF kselftests by i) unifying the rlimit handling into a helper
   that is then used by all tests, and ii) add support for testing tail calls
   to test_verifier plus add tests covering all corner cases. The latter is
   especially useful for testing JITs, from Daniel.

4) Remove x64 JIT's bpf_flush_icache() since flush_icache_range() is a noop
   on x64, from Daniel.

5) Fix one more occasion in BPF samples where we do not detach the BPF program
   from the cgroup after completion, from Prashant.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git

Thanks a lot!



The following changes since commit 3808b51911fe1a2bf8d6f4f2836d4c51aa29a6fd:

  Merge branch '10GbE' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (2018-02-26 
20:58:25 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git 

for you to fetch changes up to c51a63797acc159ead1e7b136119b18865e909c3:

  Merge branch 'bpf-bpftool-batch-improvements' (2018-03-02 09:46:42 +0100)


Alexei Starovoitov (2):
  Merge branch 'bpf-kselftest-improvements'
  Merge branch 'bpftool-visualization'

Daniel Borkmann (4):
  bpf: unify rlimit handling in selftests
  bpf: add tail call tests to test_verifier
  bpf, x64: remove bpf_flush_icache
  Merge branch 'bpf-bpftool-batch-improvements'

Jiong Wang (7):
  tools: bpftool: remove unnecessary 'if' to reduce indentation
  tools: bpftool: factor out xlated dump related code into separate file
  tools: bpftool: detect sub-programs from the eBPF sequence
  tools: bpftool: partition basic-block for each function in the CFG
  tools: bpftool: add out edges for each basic-block
  tools: bpftool: generate .dot graph from CFG information
  tools: bpftool: new command-line option and documentation for 'visual'

Prashant Bhole (1):
  samples/bpf: detach prog from cgroup

Quentin Monnet (5):
  tools: bpftool: add bash completion for CFG dump
  tools: bpftool: support comments in batch files
  tools: bpftool: support continuation lines in batch files
  tools: bpftool: read from stdin when batch file name is "-"
  tools: bpftool: add support for quotations in batch files

 arch/x86/net/bpf_jit_comp.c  |  15 +-
 samples/bpf/test_cgrp2_sock.sh   |   1 +
 samples/bpf/test_cgrp2_sock2.sh  |   3 +
 tools/bpf/bpftool/Documentation/bpftool-prog.rst |  18 +-
 tools/bpf/bpftool/bash-completion/bpftool|  13 +-
 tools/bpf/bpftool/cfg.c  | 514 +++
 tools/bpf/bpftool/cfg.h  |  43 ++
 tools/bpf/bpftool/main.c | 104 -
 tools/bpf/bpftool/prog.c | 305 ++
 tools/bpf/bpftool/xlated_dumper.c| 338 +++
 tools/bpf/bpftool/xlated_dumper.h|  64 +++
 tools/testing/selftests/bpf/bpf_rlimit.h |  28 ++
 tools/testing/selftests/bpf/test_align.c |   6 +-
 tools/testing/selftests/bpf/test_dev_cgroup.c|   6 +-
 tools/testing/selftests/bpf/test_lpm_map.c   |  14 +-
 tools/testing/selftests/bpf/test_lru_map.c   |   6 +-
 tools/testing/selftests/bpf/test_maps.c  |   7 +-
 tools/testing/selftests/bpf/test_progs.c |   7 +-
 tools/testing/selftests/bpf/test_tag.c   |   4 +-
 tools/testing/selftests/bpf/test_tcpbpf_user.c   |   6 +-
 tools/testing/selftests/bpf/test_verifier.c  | 123 +-
 tools/testing/selftests/bpf/test_verifier_log.c  |   8 +-
 22 files changed, 1259 insertions(+), 374 deletions(-)
 create mode 100644 tools/bpf/bpftool/cfg.c
 create mode 100644 tools/bpf/bpftool/cfg.h
 create mode 100644 tools/bpf/bpftool/xlated_dumper.c
 create mode 100644 tools/bpf/bpftool/xlated_dumper.h
 create mode 100644 tools/testing/selftests/bpf/bpf_rlimit.h


Re: [PATCH 00/14] Netfilter/IPVS fixes for net

2018-03-02 Thread David Miller
From: Pablo Neira Ayuso 
Date: Fri,  2 Mar 2018 21:32:48 +0100

> The following patchset contains Netfilter fixes for your net tree,
> they are:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thank you.


RE: [PATCH] pci-iov: Add support for unmanaged SR-IOV

2018-03-02 Thread Tian, Kevin
> From: Alex Williamson [mailto:alex.william...@redhat.com]
> Sent: Saturday, March 3, 2018 2:14 AM
> 
> On Fri, 2 Mar 2018 06:54:17 +
> "Tian, Kevin"  wrote:
> 
> > > From: Alex Williamson
> > > Sent: Friday, March 2, 2018 4:22 AM
> > > >
> > > > I am pretty sure that what you are describing is true of some, but not for
> > > > all. I think the Amazon solutions and the virtio solution are doing
> > > > hard partitioning of the part. I will leave it to those guys to speak
> > > > for themselves since I don't know anything about the hardware design
> > > > of those parts.
> > >
> > > I think we'd need device specific knowledge and enablement to be able
> > > to take advantage of any hardware partitioning, otherwise we need to
> > > assume the pf is privileged, as implemented in other sriov devices.
> > >
> > > I'm also trying to imagine whether there's a solution via the new
> > > vfio/mdev interface, where the mdev vendor driver would bind to the
> pf
> > > and effectively present itself as the mdev device.  The vendor driver
> > > could provide sriov capabilities and bear the burden of ensuring that
> > > the pf is used cooperatively.  The only existing mdev vendor drivers are
> > > vGPUs and rely on on-device DMA translation and isolation, such as
> > > through GTTs, but there have been some thoughts on providing IOMMU
> > > based
> > > isolation of mdev/sriov mixed devices (assuming DMA is even required
> > > for userspace management of the pf in this use case).  [Cc Kirti]
> > > Thanks,
> > >
> >
> > I hope this doesn't distract from the thread, but the above sounds like an
> > interesting idea. Actually, we once brainstormed a similar idea for another
> > potential usage - supporting VF live migration. We are already working
> > on a generic extension to allow state save/restore of an mdev instance.
> > If vendor driver could further wrap pf as a mdev instance, it could
> > leverage the same framework for a clean state migration on VF. based
> > on mmap callback the vendor driver can easily switch back-and-forth
> > between pass through and trap/emulation of the VF resources. Of
> > course doing so alone doesn't address all the demands of VF live
> > migration (e.g. dirty page tracking still requires other techniques),
> > but it does pave a way toward a general framework to support VF
> > live migration.
> >
> > If above is feasible, finally we could use one mdev framework to
> > manage both mdev and pf/vf assignment, while providing added
> > values which are difficult to achieve today. :-)
> 
> mdev drivers may be the first to support migration, but I wonder if a
> full mdev implementation is necessary for it.  Once the vfio api is
> defined, device specific extensions to vfio-pci might be able to
> implement migration more directly.  Thanks,
> 
> Alex

Yes, technically a full mdev implementation is not necessary. If
device specific extensions are placed within the vfio module, it's
obviously straightforward. What I thought earlier was that, in case vfio
wants to stay device-agnostic, we probably want device
specific extensions in the vendor driver, which is loaded but in a
dummy mode that simply does basic PCI initialization like vfio-pci
and then wraps the vf as mdev (since vfio-pci is not the vf driver in
this scenario). It's especially useful for vendor drivers which aim
to support both mdev and sr-iov by sharing common state mgmt.
knowledge, but looks like overkill for other vendor drivers. Possibly
we'll finally allow both - simple device extensions in vfio-pci and
complex device extensions in vendor drivers through vfio-mdev.

Thanks
Kevin


[PATCH] ethernet: natsemi: correct spelling

2018-03-02 Thread Randy Dunlap
From: Randy Dunlap <rdun...@infradead.org>

Correct spelling of National Semi-conductor (no hyphen)
in drivers/net/ethernet/.

Signed-off-by: Randy Dunlap <rdun...@infradead.org>
---
 drivers/net/ethernet/8390/Kconfig |2 +-
 drivers/net/ethernet/natsemi/Kconfig  |6 +++---
 drivers/net/ethernet/natsemi/Makefile |2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

--- linux-next-20180302.orig/drivers/net/ethernet/8390/Kconfig
+++ linux-next-20180302/drivers/net/ethernet/8390/Kconfig
@@ -3,7 +3,7 @@
 #
 
 config NET_VENDOR_8390
-   bool "National Semi-conductor 8390 devices"
+   bool "National Semiconductor 8390 devices"
default y
depends on NET_VENDOR_NATSEMI
    ---help---
--- linux-next-20180302.orig/drivers/net/ethernet/natsemi/Kconfig
+++ linux-next-20180302/drivers/net/ethernet/natsemi/Kconfig
@@ -1,16 +1,16 @@
 #
-# National Semi-conductor device configuration
+# National Semiconductor device configuration
 #
 
 config NET_VENDOR_NATSEMI
-   bool "National Semi-conductor devices"
+   bool "National Semiconductor devices"
default y
---help---
  If you have a network (Ethernet) card belonging to this class, say Y.
 
  Note that the answer to this question doesn't directly affect the
  kernel: saying N will just cause the configurator to skip all
- the questions about National Semi-conductor devices. If you say Y,
+ the questions about National Semiconductor devices. If you say Y,
  you will be asked for your specific card in the following questions.
 
 if NET_VENDOR_NATSEMI
--- linux-next-20180302.orig/drivers/net/ethernet/natsemi/Makefile
+++ linux-next-20180302/drivers/net/ethernet/natsemi/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-# Makefile for the National Semi-conductor Sonic devices.
+# Makefile for the National Semiconductor Sonic devices.
 #
 
 obj-$(CONFIG_MACSONIC) += macsonic.o




Re: [PATCH 1/3] pci-iov: Add support for unmanaged SR-IOV

2018-03-02 Thread Alexander Duyck
On Fri, Mar 2, 2018 at 3:59 PM, Alex Williamson
 wrote:
> On Fri, 02 Mar 2018 15:44:25 -0800
> Alexander Duyck  wrote:
>
>> From: Alexander Duyck 
>>
>> This patch is meant to add some basic functionality to support SR-IOV
>> on devices when the VFs are not managed by the kernel. The functions
>> provided here can be used by drivers such as vfio-pci and virtio to enable
>> SR-IOV on devices that are either managed by userspace, or by some sort of
>> firmware entity respectively.
>>
>> A new sysfs value called sriov_unmanaged_autoprobe has been added. This
>> value is used as the drivers_autoprobe setting of the VFs when they are
>> being managed by an external entity such as userspace or device firmware
>> instead of being managed by the kernel.
>>
>> One side effect of this change is that the sriov_drivers_autoprobe and
>> sriov_unmanaged_autoprobe will only apply their updates when SR-IOV is
>> disabled. Attempts to update them when SR-IOV is in use will only update
>> the local value and will not update sriov->autoprobe.
>>
>> Signed-off-by: Alexander Duyck 
>> ---
>>  Documentation/ABI/testing/sysfs-bus-pci |   17 ++
>>  drivers/pci/iov.c   |   37 
>> +++
>>  drivers/pci/pci-driver.c|2 +-
>>  drivers/pci/pci-sysfs.c |   29 
>>  drivers/pci/pci.h   |4 +++
>>  include/linux/pci.h |1 +
>>  6 files changed, 88 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-bus-pci 
>> b/Documentation/ABI/testing/sysfs-bus-pci
>> index 44d4b2be92fd..ff0b6c19cb1a 100644
>> --- a/Documentation/ABI/testing/sysfs-bus-pci
>> +++ b/Documentation/ABI/testing/sysfs-bus-pci
>> @@ -323,3 +323,20 @@ Description:
>>
>>   This is similar to /sys/bus/pci/drivers_autoprobe, but
>>   affects only the VFs associated with a specific PF.
>> +
>> +What:/sys/bus/pci/devices/.../sriov_unmanaged_autoprobe
>> +Date:March 2018
>> +Contact: Alexander Duyck 
>> +Description:
>> + This file is associated with the PF of a device that
>> + supports SR-IOV.  It determines whether newly-enabled VFs
>> + are immediately bound to a driver when the PF driver does
>> + not manage the VFs itself.  It initially contains 0, which
>> + means the kernel will not automatically bind VFs to a driver.
>> + If an application writes 1 to the file before enabling VFs,
>> + the kernel will bind VFs to a compatible driver immediately
>> + after they are enabled.
>> +
>> + This overrides /sys/bus/pci/devices/.../sriov_drivers_autoprobe
>> + when a PF driver is not present to manage a device, or the PF
>> + driver does not provide functionality to support SR-IOV.
>
>
> Given a pf, how does a user determine whether it is managed or unmanaged
> and therefore which autoprobe attributes are in effect?  Thanks,
>
> Alex

Basically it comes down to what driver is loaded on it. For now
vfio-pci and virtio would be the only two using the "unmanaged"
version of things.

Really you don't know which autoprobe is in effect until SR-IOV is
enabled by whatever driver. As such you should really be setting both
the drivers_autoprobe and the unmanaged_autoprobe based on the
expected use case.

- Alex


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Fri, Mar 2, 2018 at 3:12 PM, Samudrala, Sridhar
 wrote:
> On 3/2/2018 1:11 PM, Siwei Liu wrote:
>>
>> On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
>>  wrote:
>>>
>>> This patch enables virtio_net to switch over to a VF datapath when a VF
>>> netdev is present with the same MAC address. It allows live migration
>>> of a VM with a direct attached VF without the need to setup a bond/team
>>> between a VF and virtio net device in the guest.
>>>
>>> The hypervisor needs to enable only one datapath at any time so that
>>> packets don't get looped back to the VM over the other datapath. When a
>>> VF
>>> is plugged, the virtio datapath link state can be marked as down. The
>>> hypervisor needs to unplug the VF device from the guest on the source
>>> host
>>> and reset the MAC filter of the VF to initiate failover of datapath to
>>> virtio before starting the migration. After the migration is completed,
>>> the destination hypervisor sets the MAC filter on the VF and plugs it
>>> back
>>> to the guest to switch over to VF datapath.
>>>
>>> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>>> created that acts as a master device and tracks the state of the 2 lower
>>> netdevs. The original virtio_net netdev is marked as 'backup' netdev and
>>> a
>>> passthru device with the same MAC is registered as 'active' netdev.
>>>
>>> This patch is based on the discussion initiated by Jesse on this thread.
>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>>
>>> Signed-off-by: Sridhar Samudrala 
>>> Signed-off-by: Alexander Duyck 
>>> Reviewed-by: Jesse Brandeburg 
>>> ---
>>>   drivers/net/virtio_net.c | 683
>>> ++-
>>>   1 file changed, 682 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index bcd13fe906ca..f2860d86c952 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -30,6 +30,8 @@
>>>   #include 
>>>   #include 
>>>   #include 
>>> +#include 
>>> +#include 
>>>   #include 
>>>   #include 
>>>
>>> @@ -206,6 +208,9 @@ struct virtnet_info {
>>>  u32 speed;
>>>
>>>  unsigned long guest_offloads;
>>> +
>>> +   /* upper netdev created when BACKUP feature enabled */
>>> +   struct net_device *bypass_netdev;
>>>   };
>>>
>>>   struct padded_vnet_hdr {
>>> @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev,
>>> struct netdev_bpf *xdp)
>>>  }
>>>   }
>>>
>>> +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
>>> + size_t len)
>>> +{
>>> +   struct virtnet_info *vi = netdev_priv(dev);
>>> +   int ret;
>>> +
>>> +   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
>>> +   return -EOPNOTSUPP;
>>> +
>>> +   ret = snprintf(buf, len, "_bkup");
>>> +   if (ret >= len)
>>> +   return -EOPNOTSUPP;
>>> +
>>> +   return 0;
>>> +}
>>> +
>>
>> What if the systemd/udevd is not new enough to enforce the
>> n<phys_port_name> naming? Would virtio_bypass get a different name
>> than the original virtio_net? Should we detect this earlier and fall
>> back to legacy mode without creating the bypass netdev and enslaving
>> the VF?
>
>
> If udev doesn't support renaming of the devices,  then the upper bypass
> device
> should get the original name and the lower virtio netdev will get the next
> name.

If you got two virtio-net's (say e.g. eth0 and eth1) before the
update, the virtio-bypass interface on the first virtio gets eth0 and
the backup netdev would get eth1? Then the IP address originally on
eth1 gets configured on the backup interface?

> Hopefully the distros updating the kernel will also move to the new
> systemd/udev.

This is not reliable. I'd opt for a new udev API for this, and fall
back to what it was if don't see fit.

>
>
>
>>
>>>   static const struct net_device_ops virtnet_netdev = {
>>>  .ndo_open= virtnet_open,
>>>  .ndo_stop= virtnet_close,
>>> @@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev =
>>> {
>>>  .ndo_xdp_xmit   = virtnet_xdp_xmit,
>>>  .ndo_xdp_flush  = virtnet_xdp_flush,
>>>  .ndo_features_check = passthru_features_check,
>>> +   .ndo_get_phys_port_name = virtnet_get_phys_port_name,
>>>   };
>>>
>>>   static void virtnet_config_changed_work(struct work_struct *work)
>>> @@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device
>>> *vdev)
>>>  return 0;
>>>   }
>>>
>>> +/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
>>> + * When BACKUP feature is enabled, an additional netdev(bypass netdev)
>>> + * is created that acts as a master device and tracks the state of the
>>> + * 2 lower netdevs. The original 

Re: [PATCH 1/3] pci-iov: Add support for unmanaged SR-IOV

2018-03-02 Thread Alex Williamson
On Fri, 02 Mar 2018 15:44:25 -0800
Alexander Duyck  wrote:

> From: Alexander Duyck 
> 
> This patch is meant to add some basic functionality to support SR-IOV
> on devices when the VFs are not managed by the kernel. The functions
> provided here can be used by drivers such as vfio-pci and virtio to enable
> SR-IOV on devices that are either managed by userspace, or by some sort of
> firmware entity respectively.
> 
> A new sysfs value called sriov_unmanaged_autoprobe has been added. This
> value is used as the drivers_autoprobe setting of the VFs when they are
> being managed by an external entity such as userspace or device firmware
> instead of being managed by the kernel.
> 
> One side effect of this change is that the sriov_drivers_autoprobe and
> sriov_unmanaged_autoprobe will only apply their updates when SR-IOV is
> disabled. Attempts to update them when SR-IOV is in use will only update
> the local value and will not update sriov->autoprobe.
> 
> Signed-off-by: Alexander Duyck 
> ---
>  Documentation/ABI/testing/sysfs-bus-pci |   17 ++
>  drivers/pci/iov.c   |   37 
> +++
>  drivers/pci/pci-driver.c|2 +-
>  drivers/pci/pci-sysfs.c |   29 
>  drivers/pci/pci.h   |4 +++
>  include/linux/pci.h |1 +
>  6 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci 
> b/Documentation/ABI/testing/sysfs-bus-pci
> index 44d4b2be92fd..ff0b6c19cb1a 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -323,3 +323,20 @@ Description:
>  
>   This is similar to /sys/bus/pci/drivers_autoprobe, but
>   affects only the VFs associated with a specific PF.
> +
> +What:/sys/bus/pci/devices/.../sriov_unmanaged_autoprobe
> +Date:March 2018
> +Contact: Alexander Duyck 
> +Description:
> + This file is associated with the PF of a device that
> + supports SR-IOV.  It determines whether newly-enabled VFs
> + are immediately bound to a driver when the PF driver does
> + not manage the VFs itself.  It initially contains 0, which
> + means the kernel will not automatically bind VFs to a driver.
> + If an application writes 1 to the file before enabling VFs,
> + the kernel will bind VFs to a compatible driver immediately
> + after they are enabled.
> +
> + This overrides /sys/bus/pci/devices/.../sriov_drivers_autoprobe
> + when a PF driver is not present to manage a device, or the PF
> + driver does not provide functionality to support SR-IOV.


Given a pf, how does a user determine whether it is managed or unmanaged
and therefore which autoprobe attributes are in effect?  Thanks,

Alex


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Fri, Mar 2, 2018 at 1:36 PM, Michael S. Tsirkin  wrote:
> On Fri, Mar 02, 2018 at 01:11:56PM -0800, Siwei Liu wrote:
>> On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
>>  wrote:
>> > This patch enables virtio_net to switch over to a VF datapath when a VF
>> > netdev is present with the same MAC address. It allows live migration
>> > of a VM with a direct attached VF without the need to setup a bond/team
>> > between a VF and virtio net device in the guest.
>> >
>> > The hypervisor needs to enable only one datapath at any time so that
>> > packets don't get looped back to the VM over the other datapath. When a VF
>> > is plugged, the virtio datapath link state can be marked as down. The
>> > hypervisor needs to unplug the VF device from the guest on the source host
>> > and reset the MAC filter of the VF to initiate failover of datapath to
>> > virtio before starting the migration. After the migration is completed,
>> > the destination hypervisor sets the MAC filter on the VF and plugs it back
>> > to the guest to switch over to VF datapath.
>> >
>> > When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>> > created that acts as a master device and tracks the state of the 2 lower
>> > netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
>> > passthru device with the same MAC is registered as 'active' netdev.
>> >
>> > This patch is based on the discussion initiated by Jesse on this thread.
>> > https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>> >
>> > Signed-off-by: Sridhar Samudrala 
>> > Signed-off-by: Alexander Duyck 
>> > Reviewed-by: Jesse Brandeburg 
>> > ---
>> >  drivers/net/virtio_net.c | 683 
>> > ++-
>> >  1 file changed, 682 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> > index bcd13fe906ca..f2860d86c952 100644
>> > --- a/drivers/net/virtio_net.c
>> > +++ b/drivers/net/virtio_net.c
>> > @@ -30,6 +30,8 @@
>> >  #include 
>> >  #include 
>> >  #include 
>> > +#include 
>> > +#include 
>> >  #include 
>> >  #include 
>> >
>> > @@ -206,6 +208,9 @@ struct virtnet_info {
>> > u32 speed;
>> >
>> > unsigned long guest_offloads;
>> > +
>> > +   /* upper netdev created when BACKUP feature enabled */
>> > +   struct net_device *bypass_netdev;
>> >  };
>> >
>> >  struct padded_vnet_hdr {
>> > @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, 
>> > struct netdev_bpf *xdp)
>> > }
>> >  }
>> >
>> > +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
>> > + size_t len)
>> > +{
>> > +   struct virtnet_info *vi = netdev_priv(dev);
>> > +   int ret;
>> > +
>> > +   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
>> > +   return -EOPNOTSUPP;
>> > +
>> > +   ret = snprintf(buf, len, "_bkup");
>> > +   if (ret >= len)
>> > +   return -EOPNOTSUPP;
>> > +
>> > +   return 0;
>> > +}
>> > +
>>
>> What if the systemd/udevd is not new enough to enforce the
>> n<phys_port_name> naming? Would virtio_bypass get a different name
>> than the original virtio_net?
>
> You mean people using ethX names? Any hardware config change breaks
> these, I don't think that can be helped.

I don't like the way to rely on .ndo_get_phys_port_name - it's fragile
and it does not completely solve the problem it tries to address.
Imagine what you can end up with given an old udevd, or users who already
have existing explicit udev rules around phys_port_name. It does not
give you an ack saying "yes, I know you're the bypass and
you're the backup, please continue and I will give you both correct
names", or a negative acknowledgment saying "no, I don't know what these
extra interfaces are, please go back and leave the VF device alone".
We need new udev API for both feature negotiation and naming, or may
even completely hide the lower interfaces.

>
>> Should we detect this earlier and fall
>> back to legacy mode without creating the bypass netdev and enslaving
>> the VF?
>
> I don't think we can do this with existing kernel/userspace APIs.

That's why I said to make udev aware of this new type of combined
device instead of doing hacks here and there.

Regards,
-Siwei

>
> --
> MST


[PATCH 3/3] virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices

2018-03-02 Thread Alexander Duyck
From: Alexander Duyck 

Hardware-realized virtio_pci devices can implement SR-IOV, so this
patch enables its use. The device in question is an upcoming Intel
NIC that implements both a virtio_net PF and virtio_net VFs. These
are hardware realizations of what has up to now been a software
interface.

The device in question has the following 4-part PCI IDs:

PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe
VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe

The patch currently needs no check for device ID, because the callback
will never be made for devices that do not assert the capability or
when run on a platform incapable of SR-IOV.

One reason for this patch is because the hardware requires the
vendor ID of a VF to be the same as the vendor ID of the PF that
created it. So it seemed logical to simply have a fully-functioning
virtio_net PF create the VFs. This patch makes that possible.

Signed-off-by: Mark Rustad 
Signed-off-by: Alexander Duyck 
---
 drivers/virtio/virtio_pci_common.c |4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 48d4d1cf1cb6..ca1549393255 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -584,6 +584,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
else
virtio_pci_modern_remove(vp_dev);
 
+   pci_disable_sriov(pci_dev);
pci_disable_device(pci_dev);
put_device(dev);
 }
@@ -596,6 +597,9 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 #ifdef CONFIG_PM_SLEEP
.driver.pm  = &virtio_pci_pm_ops,
 #endif
+#ifdef CONFIG_PCI_IOV
+   .sriov_configure = pci_sriov_configure_unmanaged,
+#endif
 };
 
 module_pci_driver(virtio_pci_driver);



[PATCH 2/3] vfio: Add support for unmanaged or userspace managed SR-IOV

2018-03-02 Thread Alexander Duyck
From: Alexander Duyck 

This patch is meant to allow assignment of an SR-IOV enabled PF, as in VFs
have been generated, with vfio-pci. My understanding is the primary use
case for this is something like DPDK running the PF while the VFs are all
assigned to guests.

A secondary effect of this is that it provides an interface through which
it would be possible to enable SR-IOV on drivers that may not have a
physical function that actually manages the device.

Enabling SR-IOV should be pretty straightforward. As long as there are no
userspace processes currently controlling the interface the number of VFs
can be changed, and VFs will be generated without drivers being loaded on
the host. Once the userspace process begins controlling the interface the
number of VFs cannot be updated via the sysfs until the control is
released.

Note the VFs will have drivers load on them in the host if the
sriov_unmanaged_autoprobe is updated to a value of 1. However the behavior
of the VFs in such a setup cannot be guaranteed as the PF will not be
available until the userspace process starts and begins to manage the
device.

For now I am leaving the value as locked when the PF is being controlled
from userspace as a form of synchronization. Basically this way we cannot
have the number of VFs change out from under the process so it should not
require any notification framework, and the configuration can just be read
out via configuration space accesses.

Signed-off-by: Alexander Duyck 
---
 drivers/vfio/pci/vfio_pci.c |   57 +++
 1 file changed, 57 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b0f759476900..3023bda39aa9 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1224,6 +1224,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
}
 
+   pci_disable_sriov(pdev);
+
if (!disable_idle_d3)
pci_set_power_state(pdev, PCI_D0);
 }
@@ -1260,12 +1262,67 @@ static pci_ers_result_t 
vfio_pci_aer_err_detected(struct pci_dev *pdev,
.error_detected = vfio_pci_aer_err_detected,
 };
 
+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+   int err;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return -ENODEV;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return -ENODEV;
+   }
+
+   /*
+* If a userspace process is already using this device just return
+* busy and don't allow for any changes.
+*/
+   if (vdev->refcnt) {
+   pci_warn(pdev,
+"PF is currently in use, blocked until released by 
user\n");
+   return -EBUSY;
+   }
+
+   err = pci_sriov_configure_unmanaged(pdev, nr_virtfn);
+   if (err <= 0)
+   return err;
+
+   /*
+* We are now leaving VFs in the control of some unknown PF entity.
+*
+* Best case is a well behaved userspace PF is expected and any VMs
+* that the VFs will be assigned to are dependent on the userspace
+* entity anyway. An example being NFV where maybe the PF is acting
+* as an accelerated interface for a firewall or switch.
+*
+* Worst case is somebody really messed up and just enabled SR-IOV
+* on a device they were planning to assign to a VM somewhere.
+*
+* In either case it is probably best for us to set the taint flag
+* and warn the user since this could get really ugly really quick
+* if this wasn't what they were planning to do.
+*/
+   add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+   pci_warn(pdev,
+"Adding kernel taint for vfio-pci now managing SR-IOV PF 
device\n");
+
+   return nr_virtfn;
+}
+
 static struct pci_driver vfio_pci_driver = {
.name   = "vfio-pci",
.id_table   = NULL, /* only dynamic ids */
.probe  = vfio_pci_probe,
.remove = vfio_pci_remove,
.err_handler= &vfio_err_handlers,
+#ifdef CONFIG_PCI_IOV
+   .sriov_configure = vfio_pci_sriov_configure,
+#endif
 };
 
 struct vfio_devices {



[PATCH 0/3] pci-iov: Add support for unmanaged SR-IOV

2018-03-02 Thread Alexander Duyck
This series is meant to add support for SR-IOV on devices when the VFs are
not managed by the kernel. Examples of recent patches attempting to do this
include:
virtio - https://patchwork.kernel.org/patch/10241225/
pci-stub - https://patchwork.kernel.org/patch/10109935/
vfio - https://patchwork.kernel.org/patch/10103353/
uio - https://patchwork.kernel.org/patch/9974031/

Since this is quickly blowing up into a multi-driver problem it is probably
best to implement this solution as generically as possible.

This series is an attempt to do that. What we do with this patch set is 
provide a generic framework to enable SR-IOV in the case that the PF driver 
doesn't support managing the VFs itself.

I based my patch set originally on the patch by Mark Rustad but there isn't
much left after going through and cleaning out the bits that were no longer
needed, and after incorporating the feedback from David Miller. At this point
the only item to be fully reused was his patch description, which is now
present in patch 3 of the set.

I have included the authors of the original 4 patches above in the Cc here.
My hope is to get feedback and/or review on if this works for their use
cases.

My hope is that for now the pci-stub and uio driver approaches can be 
addressed using the current patch that enables vfio-pci support. The only
limitation is that it is also setting the taint flag until we have a better 
solution.

v2: Reduced scope back to just virtio_pci and vfio-pci
Broke into 3 patch set from single patch
Changed autoprobe behavior to always set when num_vfs is set non-zero

Cc: Mark Rustad 
Cc: Maximilian Heyne 
Cc: Liang-Min Wang 
Cc: David Woodhouse 

---

Alexander Duyck (3):
  pci-iov: Add support for unmanaged SR-IOV
  vfio: Add support for unmanaged or userspace managed SR-IOV
  virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices


 Documentation/ABI/testing/sysfs-bus-pci |   17 +
 drivers/pci/iov.c   |   37 
 drivers/pci/pci-driver.c|2 +
 drivers/pci/pci-sysfs.c |   29 
 drivers/pci/pci.h   |4 ++
 drivers/vfio/pci/vfio_pci.c |   57 +++
 drivers/virtio/virtio_pci_common.c  |4 ++
 include/linux/pci.h |1 +
 8 files changed, 149 insertions(+), 2 deletions(-)

--


[PATCH 1/3] pci-iov: Add support for unmanaged SR-IOV

2018-03-02 Thread Alexander Duyck
From: Alexander Duyck 

This patch is meant to add some basic functionality to support SR-IOV
on devices when the VFs are not managed by the kernel. The functions
provided here can be used by drivers such as vfio-pci and virtio to enable
SR-IOV on devices that are either managed by userspace, or by some sort of
firmware entity respectively.

A new sysfs value called sriov_unmanaged_autoprobe has been added. This
value is used as the drivers_autoprobe setting of the VFs when they are
being managed by an external entity such as userspace or device firmware
instead of being managed by the kernel.

One side effect of this change is that the sriov_drivers_autoprobe and
sriov_unmanaged_autoprobe will only apply their updates when SR-IOV is
disabled. Attempts to update them when SR-IOV is in use will only update
the local value and will not update sriov->autoprobe.

Signed-off-by: Alexander Duyck 
---
 Documentation/ABI/testing/sysfs-bus-pci |   17 ++
 drivers/pci/iov.c   |   37 +++
 drivers/pci/pci-driver.c|2 +-
 drivers/pci/pci-sysfs.c |   29 
 drivers/pci/pci.h   |4 +++
 include/linux/pci.h |1 +
 6 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci 
b/Documentation/ABI/testing/sysfs-bus-pci
index 44d4b2be92fd..ff0b6c19cb1a 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -323,3 +323,20 @@ Description:
 
This is similar to /sys/bus/pci/drivers_autoprobe, but
affects only the VFs associated with a specific PF.
+
+What:  /sys/bus/pci/devices/.../sriov_unmanaged_autoprobe
+Date:  March 2018
+Contact:   Alexander Duyck 
+Description:
+   This file is associated with the PF of a device that
+   supports SR-IOV.  It determines whether newly-enabled VFs
+   are immediately bound to a driver when the PF driver does
+   not manage the VFs itself.  It initially contains 0, which
+   means the kernel will not automatically bind VFs to a driver.
+   If an application writes 1 to the file before enabling VFs,
+   the kernel will bind VFs to a compatible driver immediately
+   after they are enabled.
+
+   This overrides /sys/bus/pci/devices/.../sriov_drivers_autoprobe
+   when a PF driver is not present to manage a device, or the PF
+   driver does not provide functionality to support SR-IOV.
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924ae0350..3dcec1fa86bd 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -446,6 +446,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
iov->pgsz = pgsz;
iov->self = dev;
+   iov->autoprobe = true;
iov->drivers_autoprobe = true;
pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
@@ -683,6 +684,9 @@ int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
if (!dev->is_physfn)
return -ENOSYS;
 
+   /* Update autoprobe setting to reflect managed device */
+   dev->sriov->autoprobe = dev->sriov->drivers_autoprobe;
+
return sriov_enable(dev, nr_virtfn);
 }
 EXPORT_SYMBOL_GPL(pci_enable_sriov);
@@ -807,3 +811,36 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
return dev->sriov->total_VFs;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
+
+/**
+ * pci_sriov_configure_unmanaged - helper to configure unmanaged SR-IOV
+ * @dev: the PCI device
+ * @nr_virtfn: number of virtual functions to enable, 0 to disable
+ *
+ * Used to provide generic enable/disable SR-IOV option for devices
+ * that do not manage the VFs generated by their driver, or have no
+ * driver present.
+ */
+int pci_sriov_configure_unmanaged(struct pci_dev *dev, int nr_virtfn)
+{
+   int err;
+
+   might_sleep();
+
+   if (!dev->is_physfn)
+   return -ENODEV;
+
+   if (!nr_virtfn) {
+   sriov_disable(dev);
+
+   return 0;
+   }
+
+   /* Update autoprobe setting to reflect unmanaged device */
+   dev->sriov->autoprobe = dev->sriov->unmanaged_autoprobe;
+
+   err = sriov_enable(dev, nr_virtfn);
+
+   return err ? err : nr_virtfn;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_configure_unmanaged);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 3bed6beda051..2cc68dff6130 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -398,7 +398,7 @@ void __weak pcibios_free_irq(struct pci_dev *dev)
 #ifdef CONFIG_PCI_IOV
 static inline bool 

Re: [PATCH net-next] net/ipv6: Address checks need to consider the L3 domain

2018-03-02 Thread David Ahern
On 3/2/18 4:09 PM, David Ahern wrote:
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index f0ae58424c45..792e7432ba6d 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -2820,9 +2820,9 @@ static struct rt6_info *ip6_route_info_create(struct 
> fib6_config *cfg,
>* prefix route was assigned to, which might be non-loopback.
>*/
>   err = -EINVAL;
> - if (ipv6_chk_addr_and_flags(net, gw_addr,
> + if (ipv6_chk_addr_and_flags(net, gw_addr, dev,
>   gwa_type & IPV6_ADDR_LINKLOCAL ?
> - dev : NULL, 0, 0)) {
> + false : true, 0, 0)) {
>   NL_SET_ERR_MSG(extack, "Invalid gateway address");
>   goto out;
>   }

Of course I find this right after sending the patch 

The route add needs another check to see if it is a local address.
Please drop this one; will send a v2 next week.


Re: [PATCH net-next 2/4] net: stmmac: use correct barrier between coherent memory and MMIO

2018-03-02 Thread Niklas Cassel
On Fri, Mar 02, 2018 at 09:54:11AM -0500, David Miller wrote:
> From: Pavel Machek 
> Date: Fri, 2 Mar 2018 10:20:00 +0100
>

Hello Pavel, David

> >> This barrier cannot be a simple dma_wmb(), since a dma_wmb() is only
> >> used to guarantee the ordering, with respect to other writes,
> >> to cache coherent DMA memory.
> > 
> > Could you explain this a bit more (and perhaps in code comment)?

Have a look at:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/memory-barriers.txt?h=v4.16-rc1#n1913

AFAICT, a dma_wmb() can only be used to guarantee that the
writes to cache coherent memory (e.g. memory allocated with
dma_alloc_coherent()) before the dma_wmb() will be performed
before the writes to cache coherent memory after the dma_wmb().

Since most of our writes are simply writing new buffer addresses
and sizes to TDES0/TDES1/TDES2/TDES3, and since these TX DMA
descriptors have been allocated with dma_alloc_coherent(),
a dma_wmb() should be enough to e.g. make sure that TDES3
(which contains the OWN bit), is written after the writes to
TDES0/TDES1/TDES2.

However, the last write we do is "DMA start transmission",
this is a register in the IP, i.e. it is a write to the cache
incoherent MMIO region (rather than a write to cache coherent memory).
To ensure that all writes to cache coherent memory have
completed before we start the DMA, we have to use the barrier
wmb() (which performs a more extensive flush compared to
dma_wmb()).

So the only place where we have to use a wmb() instead
of a dma_wmb() is where we have a write to coherent memory,
followed by a write to cache incoherent MMIO.
The only obvious place where we have this situation is
where we write the OWN bit immediately followed by a write
to the "DMA start transmission" register.

Note that this also matches how it's done in other drivers:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/amd/xgbe/xgbe-dev.c?h=v4.16-rc1#n1638

There is already a comment describing the barrier in
stmmac_xmit() and stmmac_tso_xmit() that says:
/* The own bit must be the latest setting done when prepare the
 * descriptor and then barrier is needed to make sure that
 * all is coherent before granting the DMA engine.
 */
However, if you want, we could mention wmb() explicitly in this comment.

> > 
> > Ensuring other writes are done before writing the "GO!" bit should be
> > enough, no?
> 
> Indeed, the chip should never look at the descriptor contents unless
> the GO bit is set.
> 
> If there are ways that it can, this must be explained and documented
> since it is quite unusual compared to other hardware.
> 
> > (If it is not, do we need heavier barriers in other places, too?)
> 
> Right.

I hope that my explanation above has cleared any potential confusion.


Best regards,
Niklas


Re: [PATCH RESEND net-next 0/2] ntuple filters with RSS

2018-03-02 Thread Alexander Duyck
On Fri, Mar 2, 2018 at 10:55 AM, Jakub Kicinski  wrote:
> On Fri, 2 Mar 2018 15:24:29 +, Edward Cree wrote:
>> On Tue, Feb 27, 2018 at 3:47 PM, Jakub Kicinski  wrote:
>>
>> > Please, let's stop extending ethtool_rx_flow APIs.  I bit my tongue
>> > when Intel was adding their "redirection to VF" based on ethtool ntuples
>> > and look now they're adding the same functionality with flower :|  And
>> > wonder how to handle two interfaces doing the same thing.
>> Since sfc only supports ethtool NFC interfaces (we have no flower support,
>>  and I also wonder how one is to support both of those interfaces without
>>  producing an ugly mess), I'd much rather put this in ethtool than have to
>>  implement all of flower just so we can have this extension.
>
> "Just this one extension" is exactly the attitude that can lead to
> messy APIs :(
>
>> I guess part of the question is, which other drivers besides us would want
>>  to implement something like this, and what are their requirements?
>
> I think every vendor is trying to come up with ways to make their HW
> work with containers better these days.
>
>> > On the use case itself, I wonder how much sense that makes.  Can your
>> > hardware not tag the packet as well so you could then mux it to
>> > something like macvlan offload?
>> In practice the only way our hardware can "tag the packet" is by the
>>  selection of RX queue.  So you could for instance give a container its
>>  own RX queues (rather than just using the existing RX queues on the
>>  appropriate CPUs), and maybe in future hook those queues up to l2fwd
>>  offload somehow.
>> But that seems like a separate job (offloading the macvlan switching) to
>>  what this series is about (making the RX processing happen on the right
>>  CPUs).  Is software macvlan switching really noticeably slow, anyway?
>
> OK, thanks for clarifying.
>
>> Besides, more powerful filtering than just MAC addr might be needed, if,
>>  for instance, the container network is encapsulated.  In that case
>>  something like a UDP 4-tuple filter might be necessary (or, indeed, a
>>  filter looking at the VNID (VxLAN TNI) - which our hardware can do but
>>  ethtool doesn't currently have a way to specify).  AFAICT l2-fwd-offload
>>  can only be used for straight MAC addr, not for overlay networks like
>>  VxLAN or FOU?  At least, existing ndo_dfwd_add_station() implementations
>>  don't seem to check that dev is a macvlan...  Does it even support
>>  VLAN filters?  fm10k implementation doesn't seem to.
>
> Exactly!  One can come up with many protocol combinations which flower
> already has APIs for...  ethtool is not the place for it.
>
>> Anyway, like I say, filtering traffic onto its own queues seems to be
>>  orthogonal, or at least separate, to binding those queues into an
>>  upperdev for demux offload.
>
> It is, I was just trying to broaden the scope to more capable HW so we
> design APIs that would serve all.
>
>> On 28/02/18 01:24, Alexander Duyck wrote:
>>
>> > We did something like this for i40e. Basically we required creating
>> > the queue groups using mqprio to keep them symmetric on Tx and Rx, and
>> > then allowed for TC ingress filters to redirect traffic to those queue
>> > groups.
>> >
>> > - Alex
>> If we're not doing macvlan offload, I'm not sure what, if anything, the
>>  TX side would buy us.  So for now it seems to make sense for TX just to
>>  use the TXQ associated with the CPU from which the TX originates, which
>>  I believe already happens automatically.
>
> I don't think that's what Alex was referring to.  Please see
> commit e284fc280473 ("i40e: Add and delete cloud filter") for
> instance :)

Right. And as far as the Tx queue association goes right now we are
basing things off of skb->priority which is easily controlled via
cgroups. So in theory you could associate a given set of cgroup to a
specific set of Tx queues using this approach.

Most of the filtering that Jakub pointed out is applied to the Rx side
to make sure the packets come in on the right queue set.

- Alex


WAITING FOR YOUR URGENT AND IMMEDIATE RESPONSE.

2018-03-02 Thread Dr Rhama Benson
Dear Friend,

How are you today with your family, Hope all is well?. Please, I would
like you to give an urgent attention to this proposal. I have a very
lucrative business transaction which requires your utmost discretion.

Though, I know it would come to you at uttermost surprise. I am Dr
Rhama Benson, A banker by profession. Please, I want to transfer the
sum of ($10.5.million) dollars into your bank account. This business
is 100% risk free.

Your share will be 40% while 60% for me.

Full details will be send to you on the receipt of your urgent
response by forwarding the following details bellow.

1. Your Full Name
2. Your Telephone No..
3. Your Age
4. Your Home Address
5. Your Country

Thanks for your anticipated co-operation.
Best regards.
Dr Rhama.


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Samudrala, Sridhar

On 3/2/2018 1:11 PM, Siwei Liu wrote:

On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
 wrote:

This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. It allows live migration
of a VM with a direct attached VF without the need to setup a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to enable only one datapath at any time so that
packets don't get looped back to the VM over the other datapath. When a VF
is plugged, the virtio datapath link state can be marked as down. The
hypervisor needs to unplug the VF device from the guest on the source host
and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

When BACKUP feature is enabled, an additional netdev(bypass netdev) is
created that acts as a master device and tracks the state of the 2 lower
netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
passthru device with the same MAC is registered as 'active' netdev.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization=151189725224231=2

Signed-off-by: Sridhar Samudrala 
Signed-off-by: Alexander Duyck 
Reviewed-by: Jesse Brandeburg 
---
  drivers/net/virtio_net.c | 683 ++-
  1 file changed, 682 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bcd13fe906ca..f2860d86c952 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,6 +30,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  #include 
  #include 

@@ -206,6 +208,9 @@ struct virtnet_info {
 u32 speed;

 unsigned long guest_offloads;
+
+   /* upper netdev created when BACKUP feature enabled */
+   struct net_device *bypass_netdev;
  };

  struct padded_vnet_hdr {
@@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct 
netdev_bpf *xdp)
 }
  }

+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
+ size_t len)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+   int ret;
+
+   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
+   return -EOPNOTSUPP;
+
+   ret = snprintf(buf, len, "_bkup");
+   if (ret >= len)
+   return -EOPNOTSUPP;
+
+   return 0;
+}
+

What if the systemd/udevd is not new enough to enforce the
n naming? Would virtio_bypass get a different name
than the original virtio_net? Should we detect this earlier and fall
back to legacy mode without creating the bypass netdev and enslaving
the VF?


If udev doesn't support renaming of the devices,  then the upper bypass device
should get the original name and the lower virtio netdev will get the next name.
Hopefully the distros updating the kernel will also move to the new 
systemd/udev.





  static const struct net_device_ops virtnet_netdev = {
 .ndo_open= virtnet_open,
 .ndo_stop= virtnet_close,
@@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev = {
 .ndo_xdp_xmit   = virtnet_xdp_xmit,
 .ndo_xdp_flush  = virtnet_xdp_flush,
 .ndo_features_check = passthru_features_check,
+   .ndo_get_phys_port_name = virtnet_get_phys_port_name,
  };

  static void virtnet_config_changed_work(struct work_struct *work)
@@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device *vdev)
 return 0;
  }

+/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
+ * When BACKUP feature is enabled, an additional netdev(bypass netdev)
+ * is created that acts as a master device and tracks the state of the
+ * 2 lower netdevs. The original virtio_net netdev is registered as
+ * 'backup' netdev and a passthru device with the same MAC is registered
+ * as 'active' netdev.
+ */
+
+/* bypass state maintained when BACKUP feature is enabled */
+struct virtnet_bypass_info {
+   /* passthru netdev with same MAC */
+   struct net_device __rcu *active_netdev;
+
+   /* virtio_net netdev */
+   struct net_device __rcu *backup_netdev;
+
+   /* active netdev stats */
+   struct rtnl_link_stats64 active_stats;
+
+   /* backup netdev stats */
+   struct rtnl_link_stats64 backup_stats;
+
+   /* aggregated stats */
+   struct rtnl_link_stats64 bypass_stats;
+
+   /* spinlock while updating stats */
+   spinlock_t stats_lock;
+};
+
+static void virtnet_bypass_child_open(struct net_device *dev,
+ struct net_device *child_netdev)
+{
+   

Re: [PATCH 09/58] net/irda: Convert timers to use timer_setup()

2018-03-02 Thread Marcelo Ricardo Leitner
On Fri, Mar 02, 2018 at 02:30:30PM -0800, Kees Cook wrote:
> On Fri, Mar 2, 2018 at 1:29 PM, Marcelo Ricardo Leitner
>  wrote:
> > Note how it is using the irda_start_timer definition from
> > include/net/irda/timer.h instead of
> > drivers/staging/irda/include/net/irda/timer.h which was patched in
> > this patch.
> 
> $ git show net-next/master:include/net/irda/iriap.h
> fatal: Path 'include/net/irda/iriap.h' does not exist in 'net-next/master'
> 
> 4.14 moved include/net/irda/iriap.h into the staging directory:
> 
> 5bf916ee0ab6 ("irda: move include/net/irda into staging subdirectory")
> 
> I think you've got a stale copy of the old file in your tree...

Right you are. Sorry for the noise.

Thanks,
  Marcelo


[PATCH v2 0/4] net: Use strlcpy() for ethtool::get_strings

2018-03-02 Thread Florian Fainelli
Hi all,

After turning on KASAN on one of my systems, I started getting lots of out of
bounds errors while fetching a given port's statistics, and indeed using
memcpy() is unsafe for copying strings which have not been declared as an array
of ETH_GSTRING_LEN bytes, so let's use strlcpy() instead. This allows the best
of both worlds: we still keep the efficient memory usage of variably sized
strings, but we don't copy more than we need to.

Changes in v2:
- dropped the 3 other patches that were not necessary
- use strlcpy() instead of strncpy()

Florian Fainelli (4):
  net: dsa: b53: Use strlcpy() for ethtool::get_strings
  net: phy: marvell: Use strlcpy() for ethtool::get_strings
  net: phy: micrel: Use strlcpy() for ethtool::get_strings
  net: phy: broadcom: Use strlcpy() for ethtool::get_strings

 drivers/net/dsa/b53/b53_common.c | 4 ++--
 drivers/net/phy/bcm-phy-lib.c| 4 ++--
 drivers/net/phy/marvell.c| 4 ++--
 drivers/net/phy/micrel.c | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

-- 
2.14.1



[PATCH v2 2/4] net: phy: marvell: Use strlcpy() for ethtool::get_strings

2018-03-02 Thread Florian Fainelli
Our statistics strings are allocated at initialization without being
bound to a specific size, yet, we would copy ETH_GSTRING_LEN bytes using
memcpy() which would create out of bounds accesses, this was flagged by
KASAN. Replace this with strlcpy() to make sure we are bound by the source
buffer size and we also always NUL-terminate strings.

Fixes: d2fa47d9dd5c ("phy: marvell: Add ethtool statistics counters")
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/marvell.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 22d9bc9c33a4..0e0978d8a0eb 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -1452,8 +1452,8 @@ static void marvell_get_strings(struct phy_device 
*phydev, u8 *data)
int i;
 
for (i = 0; i < ARRAY_SIZE(marvell_hw_stats); i++) {
-   memcpy(data + i * ETH_GSTRING_LEN,
-  marvell_hw_stats[i].string, ETH_GSTRING_LEN);
+   strlcpy(data + i * ETH_GSTRING_LEN,
+   marvell_hw_stats[i].string, ETH_GSTRING_LEN);
}
 }
 
-- 
2.14.1



[PATCH v2 1/4] net: dsa: b53: Use strlcpy() for ethtool::get_strings

2018-03-02 Thread Florian Fainelli
Our statistics strings are allocated at initialization without being
bound to a specific size, yet, we would copy ETH_GSTRING_LEN bytes using
memcpy() which would create out of bounds accesses, this was flagged by
KASAN. Replace this with strlcpy() to make sure we are bound by the source
buffer size and we also always NUL-terminate strings.

Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch")
Signed-off-by: Florian Fainelli 
---
 drivers/net/dsa/b53/b53_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index db830a1141d9..63e02a54d537 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -814,8 +814,8 @@ void b53_get_strings(struct dsa_switch *ds, int port, 
uint8_t *data)
unsigned int i;
 
for (i = 0; i < mib_size; i++)
-   memcpy(data + i * ETH_GSTRING_LEN,
-  mibs[i].name, ETH_GSTRING_LEN);
+   strlcpy(data + i * ETH_GSTRING_LEN,
+   mibs[i].name, ETH_GSTRING_LEN);
 }
 EXPORT_SYMBOL(b53_get_strings);
 
-- 
2.14.1



[PATCH v2 4/4] net: phy: broadcom: Use strlcpy() for ethtool::get_strings

2018-03-02 Thread Florian Fainelli
Our statistics strings are allocated at initialization without being
bound to a specific size, yet, we would copy ETH_GSTRING_LEN bytes using
memcpy() which would create out of bounds accesses, this was flagged by
KASAN. Replace this with strlcpy() to make sure we are bound by the source
buffer size and we also always NUL-terminate strings.

Fixes: 820ee17b8d3b ("net: phy: broadcom: Add support code for reading PHY 
counters")
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/bcm-phy-lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c
index 171010eb4d9c..5ad130c3da43 100644
--- a/drivers/net/phy/bcm-phy-lib.c
+++ b/drivers/net/phy/bcm-phy-lib.c
@@ -341,8 +341,8 @@ void bcm_phy_get_strings(struct phy_device *phydev, u8 
*data)
unsigned int i;
 
for (i = 0; i < ARRAY_SIZE(bcm_phy_hw_stats); i++)
-   memcpy(data + i * ETH_GSTRING_LEN,
-  bcm_phy_hw_stats[i].string, ETH_GSTRING_LEN);
+   strlcpy(data + i * ETH_GSTRING_LEN,
+   bcm_phy_hw_stats[i].string, ETH_GSTRING_LEN);
 }
 EXPORT_SYMBOL_GPL(bcm_phy_get_strings);
 
-- 
2.14.1



[PATCH net-next] net/ipv6: Address checks need to consider the L3 domain

2018-03-02 Thread David Ahern
ipv6_chk_addr_and_flags determines if an address is a local address. It
is called by ip6_route_info_create to validate a gateway address is not a
local address. It currently does not consider L3 domains and as a result
does not allow a route to be added in one VRF if the nexthop points to
an address in a second VRF. e.g.,

$ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23
Error: Invalid gateway address.

where 2001:db8:102::23 is an address on an interface in vrf r1.

Resolve by comparing the l3mdev for the passed in device and requiring an
l3mdev match with the device containing an address. The intent of checking
for an address on the specified device versus any device in the domain is
maintained by a new argument to skip the check between the passed in device
and the device with the address.

Update the handful of users of ipv6_chk_addr with a NULL dev argument:
- anycast to call ipv6_chk_addr_and_flags. If the device is given by the
  user, look for the given address across the L3 domain. If the index is
  not given, the default table is presumed so only addresses on devices
  not enslaved are considered.

- ip6_tnl_rcv_ctl - local address must exist on device, remote address
  can not exist in L3 domain; only remote check needs to be updated but
  do both for consistency.

Signed-off-by: David Ahern 
---
 include/net/addrconf.h |  4 ++--
 net/ipv6/addrconf.c| 26 ++
 net/ipv6/anycast.c |  9 ++---
 net/ipv6/datagram.c|  5 +++--
 net/ipv6/ip6_tunnel.c  | 12 
 net/ipv6/ndisc.c   |  2 +-
 net/ipv6/route.c   |  4 ++--
 7 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c4185a7b0e90..132e5b95167a 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
  const struct net_device *dev, int strict);
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-   const struct net_device *dev, int strict,
-   u32 banned_flags);
+   const struct net_device *dev, bool skip_dev_check,
+   int strict, u32 banned_flags);
 
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b5fd116c046a..c8432d778a3f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1851,22 +1851,40 @@ static int ipv6_count_addresses(const struct inet6_dev 
*idev)
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
  const struct net_device *dev, int strict)
 {
-   return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
+   return ipv6_chk_addr_and_flags(net, addr, dev, false,
+  strict, IFA_F_TENTATIVE);
 }
 EXPORT_SYMBOL(ipv6_chk_addr);
 
+/* device argument is used to find the L3 domain of interest. If
+ * skip_dev_check is set, then the ifp device is not checked against
+ * the passed in dev argument. So the 2 cases for addresses checks are:
+ *   1. does the address exist in the L3 domain that dev is part of
+ *  (skip_dev_check = true), or
+ *
+ *   2. does the address exist on the specific device
+ *  (skip_dev_check = false)
+ */
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-   const struct net_device *dev, int strict,
-   u32 banned_flags)
+   const struct net_device *dev, bool skip_dev_check,
+   int strict, u32 banned_flags)
 {
unsigned int hash = inet6_addr_hash(net, addr);
+   const struct net_device *l3mdev;
struct inet6_ifaddr *ifp;
u32 ifp_flags;
 
rcu_read_lock();
+
+   l3mdev = l3mdev_master_dev_rcu(dev);
+
hlist_for_each_entry_rcu(ifp, _addr_lst[hash], addr_lst) {
if (!net_eq(dev_net(ifp->idev->dev), net))
continue;
+
+   if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+   continue;
+
/* Decouple optimistic from tentative for evaluation here.
 * Ban optimistic addresses explicitly, when required.
 */
@@ -1875,7 +1893,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct 
in6_addr *addr,
: ifp->flags;
if (ipv6_addr_equal(>addr, addr) &&
!(ifp_flags_flags) &&
-   (!dev || ifp->idev->dev == dev ||
+   (skip_dev_check || ifp->idev->dev == dev ||
 !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {

[PATCH v2 3/4] net: phy: micrel: Use strlcpy() for ethtool::get_strings

2018-03-02 Thread Florian Fainelli
Our statistics strings are allocated at initialization without being
bound to a specific size, yet, we would copy ETH_GSTRING_LEN bytes using
memcpy() which would create out of bounds accesses, this was flagged by
KASAN. Replace this with strlcpy() to make sure we are bound by the source
buffer size and we also always NUL-terminate strings.

Fixes: 2b2427d06426 ("phy: micrel: Add ethtool statistics counters")
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/micrel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 0f45310300f6..49be85afbea9 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -664,8 +664,8 @@ static void kszphy_get_strings(struct phy_device *phydev, 
u8 *data)
int i;
 
for (i = 0; i < ARRAY_SIZE(kszphy_hw_stats); i++) {
-   memcpy(data + i * ETH_GSTRING_LEN,
-  kszphy_hw_stats[i].string, ETH_GSTRING_LEN);
+   strlcpy(data + i * ETH_GSTRING_LEN,
+   kszphy_hw_stats[i].string, ETH_GSTRING_LEN);
}
 }
 
-- 
2.14.1



Re: [PATCH net] ipv6: Reflect MTU changes on PMTU of exceptions for MTU-less routes

2018-03-02 Thread David Ahern
On 3/2/18 8:36 AM, Stefano Brivio wrote:
> Currently, administrative MTU changes on a given netdevice are
> not reflected on route exceptions for MTU-less routes, with a
> set PMTU value, for that device:
> 
>  # ip -6 route get 3000::b
>  3000::b from :: dev vti_a proto kernel src 3000::a metric 256 pref medium
>  # ping6 -c 1 -q -s1 3000::b > /dev/null
>  # ip netns exec a ip -6 route get 3000::b
>  3000::b from :: dev vti_a src 3000::a metric 0
>  cache expires 571sec mtu 4926 pref medium
>  # ip link set dev vti_a mtu 3000
>  # ip -6 route get 3000::b
>  3000::b from :: dev vti_a src 3000::a metric 0
>  cache expires 571sec mtu 4926 pref medium
>  # ip link set dev vti_a mtu 9000
>  # ip -6 route get 3000::b
>  3000::b from :: dev vti_a src 3000::a metric 0
>  cache expires 571sec mtu 4926 pref medium

Addresses in the 2001:db8: range should be used for commit messages.

And please codify the above expectation as a test under
tools/testing/selftests/net





Re: [PATCH PATCH net v2 0/9] hv_netvsc: minor fixes

2018-03-02 Thread Jakub Kicinski
On Fri,  2 Mar 2018 13:49:00 -0800, Stephen Hemminger wrote:
>- change propagate rx mode patch to handle startup of vf

Thanks! :)


Re: Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Kees Cook
On Fri, Mar 2, 2018 at 2:26 PM, Alexei Starovoitov
 wrote:
> On Fri, Mar 02, 2018 at 02:04:17PM -0800, Gianluca Borello wrote:
>> On Fri, Mar 2, 2018 at 12:42 PM, Alexei Starovoitov
>>  wrote:
>> >
>> > good catch!
>> > I wonder why sched.h is using this flag instead of relying on #defines from 
>> > autoconf.h
>> > It could have been using CONFIG_HAVE_CC_STACKPROTECTOR
>> > instead of CONFIG_CC_STACKPROTECTOR, no ?
>> >
>>
>> Thanks for your reply Alexei. I think switching to
>> HAVE_CC_STACKPROTECTOR could indeed solve this particular BPF issue in
>> a cleaner way (I tested it), at the cost of having that struct member
>> always present for the supported architectures even if the stack
>> protector is actually disabled (e.g. CONFIG_CC_STACKPROTECTOR_NONE=y).
>
> if defined(HAVE_CC_STACKPROTECTOR) && !defined(CONFIG_CC_STACKPROTECTOR_NONE)

CONFIG_CC_STACKPROTECTOR_AUTO may result in no stack protector, so
CONFIG_CC_STACKPROTECTOR is the way to determine if it should exist.

> let's fix it properly instead of adding more hacks to Makefiles

It is being fixed properly -- the detection code is being moved out of
Makefile into Kconfig, at which point this won't be as weird as it is.

If KBUILD_CPPFLAGS won't work for you, I'm not hugely opposed to
switching the task_struct ifdef to HAVE_CC_STACKPROTECTOR, since it is
extremely rare to build without stack protector on architectures that
support it.

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH 09/58] net/irda: Convert timers to use timer_setup()

2018-03-02 Thread Kees Cook
On Fri, Mar 2, 2018 at 1:29 PM, Marcelo Ricardo Leitner
 wrote:
> Note how it is using the irda_start_timer definition from
> include/net/irda/timer.h instead of
> drivers/staging/irda/include/net/irda/timer.h which was patched in
> this patch.

$ git show net-next/master:include/net/irda/iriap.h
fatal: Path 'include/net/irda/iriap.h' does not exist in 'net-next/master'

4.14 moved include/net/irda/iriap.h into the staging directory:

5bf916ee0ab6 ("irda: move include/net/irda into staging subdirectory")

I think you've got a stale copy of the old file in your tree...

-Kees

-- 
Kees Cook
Pixel Security


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Fri, Mar 2, 2018 at 1:31 PM, Michael S. Tsirkin  wrote:
> On Fri, Mar 02, 2018 at 12:44:56PM -0800, Siwei Liu wrote:
>> On Fri, Mar 2, 2018 at 12:10 PM, Michael S. Tsirkin  wrote:
>> > On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:
>> >>
>> >>
>> >> On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:
>> >> > On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:
>> >> > > The design limits things to a 1:1 relationship since we just have the
>> >> > > child and backup pointers, but I don't think I am seeing exception
>> >> > > handling to prevent us from overwriting the child pointers so there
>> >> > > may be a leak there.
>> >> > >
>> >> > > Thanks.
>> >> > >
>> >> > > - Alex
>> >> > In fact maintaining a list in that case would be nicer, and
>> >> > just use an arbitrary one.
>> >> > E.g. one can see how a user wanting to swap device 1 for device 2
>> >> > might first add device 2 with same MAC then drop device 1.
>> >>
>> >> It should be possible to swap VF1 with VF2 by
>> >> 1.- enabling virtio link
>> >> 2.- unplugging VF1
>> >> 3.- plugging VF2
>> >> 4.- disabling virtio link
>> >>
>> >
>> > True, but it isn't hard to avoid breakage if user
>> > swapped steps 2 and 3. No need to make it more
>> > fragile than it has to be.
>>
>> The migration case, VF2 is associated with another PF on another
>> machine (destination), I wonder how it is possible.
>
> E.g. you want to remove the PF so you unplug the VF
> then add another VF of the same PF.
>
>> Even with local plugging of VF2 on the same PF, the MAC address
>> requirement (VF1's == VF2's) would fail the MAC address assignment on
>> VF2.
>>
>> -Siwei
>
> Why would it fail? These are separate cards.

OK. I realized that you may talk about assigning a VF on a different
PF (VF1 on PF1 while VF2 on PF2). And we might assign a pass-through
device rather than a VF. Yes, it's indeed possible that may happen but
I take it as a further step down (another patch maybe) as it would
involve changes to notify the network with gratuitous ARP and/or
unsolicited ND advertisement of the MAC address association with the
new port.

-Siwei

>
>> >
>> > --
>> > MST
>> >
>> > -
>> > To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org
>> > For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org
>> >


Re: Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Alexei Starovoitov
On Fri, Mar 02, 2018 at 02:04:17PM -0800, Gianluca Borello wrote:
> On Fri, Mar 2, 2018 at 12:42 PM, Alexei Starovoitov
>  wrote:
> >
> > good catch!
> > I wonder why sched.h is using this flag instead of relying on #defines from 
> > autoconf.h
> > It could have been using CONFIG_HAVE_CC_STACKPROTECTOR
> > instead of CONFIG_CC_STACKPROTECTOR, no ?
> >
> 
> Thanks for your reply Alexei. I think switching to
> HAVE_CC_STACKPROTECTOR could indeed solve this particular BPF issue in
> a cleaner way (I tested it), at the cost of having that struct member
> always present for the supported architectures even if the stack
> protector is actually disabled (e.g. CONFIG_CC_STACKPROTECTOR_NONE=y).

if defined(HAVE_CC_STACKPROTECTOR) && !defined(CONFIG_CC_STACKPROTECTOR_NONE)

or

def(have_cc) && (def(cc_stack_regular) || def(cc_stack_strong) || 
def(cc_stack_auto))

let's fix it properly instead of adding more hacks to Makefiles



Re: Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Kees Cook
On Fri, Mar 2, 2018 at 2:04 PM, Gianluca Borello  wrote:
> On Fri, Mar 2, 2018 at 12:42 PM, Alexei Starovoitov
>  wrote:
>>
>> good catch!
>> I wonder why sched.h is using this flag instead of relying on #defines from 
>> autoconf.h
>> It could have been using CONFIG_HAVE_CC_STACKPROTECTOR
>> instead of CONFIG_CC_STACKPROTECTOR, no ?
>>
>
> Thanks for your reply Alexei. I think switching to
> HAVE_CC_STACKPROTECTOR could indeed solve this particular BPF issue in
> a cleaner way (I tested it), at the cost of having that struct member
> always present for the supported architectures even if the stack
> protector is actually disabled (e.g. CONFIG_CC_STACKPROTECTOR_NONE=y).
>
> Not sure if this could be frowned upon by someone considering how
> critical task_struct is, but on the other hand is really just 8 bytes.

That structure is huge, and I think it's proper to leave this as is.

Adding KBUILD_CPPFLAGS (for now) seems like the right way to go;
though in the future stack protector will be changed around again (to
be purely Kconfig again). There are a number of issues with its logic
in detecting and enabling, and another draft at solving it is under
development.

-Kees

-- 
Kees Cook
Pixel Security


Re: Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Gianluca Borello
On Fri, Mar 2, 2018 at 12:42 PM, Alexei Starovoitov
 wrote:
>
> good catch!
> I wonder why sched.h is using this flag instead of relying on #defines from 
> autoconf.h
> It could have been using CONFIG_HAVE_CC_STACKPROTECTOR
> instead of CONFIG_CC_STACKPROTECTOR, no ?
>

Thanks for your reply Alexei. I think switching to
HAVE_CC_STACKPROTECTOR could indeed solve this particular BPF issue in
a cleaner way (I tested it), at the cost of having that struct member
always present for the supported architectures even if the stack
protector is actually disabled (e.g. CONFIG_CC_STACKPROTECTOR_NONE=y).

Not sure if this could be frowned upon by someone considering how
critical task_struct is, but on the other hand is really just 8 bytes.

Thanks


[PATCH PATCH net v2 1/9] hv_netvsc: avoid retry on send during shutdown

2018-03-02 Thread Stephen Hemminger
Change the initialization order so that the device is ready to transmit
(ie connect vsp is completed) before setting the internal reference
to the device with RCU.

This avoids any races on initialization and prevents retry issues
on shutdown.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 17e529af79dc..686900d61374 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -852,13 +852,6 @@ int netvsc_send(struct net_device *ndev,
if (unlikely(!net_device || net_device->destroy))
return -ENODEV;
 
-   /* We may race with netvsc_connect_vsp()/netvsc_init_buf() and get
-* here before the negotiation with the host is finished and
-* send_section_map may not be allocated yet.
-*/
-   if (unlikely(!net_device->send_section_map))
-   return -EAGAIN;
-
nvchan = _device->chan_table[packet->q_idx];
packet->send_buf_index = NETVSC_INVALID_INDEX;
packet->cp_partial = false;
@@ -866,10 +859,8 @@ int netvsc_send(struct net_device *ndev,
/* Send control message directly without accessing msd (Multi-Send
 * Data) field which may be changed during data packet processing.
 */
-   if (!skb) {
-   cur_send = packet;
-   goto send_now;
-   }
+   if (!skb)
+   return netvsc_send_pkt(device, packet, net_device, pb, skb);
 
/* batch packets in send buffer if possible */
msdp = >msd;
@@ -953,7 +944,6 @@ int netvsc_send(struct net_device *ndev,
}
}
 
-send_now:
if (cur_send)
ret = netvsc_send_pkt(device, cur_send, net_device, pb, skb);
 
@@ -1306,11 +1296,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device 
*device,
 
napi_enable(_device->chan_table[0].napi);
 
-   /* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
-* populated.
-*/
-   rcu_assign_pointer(net_device_ctx->nvdev, net_device);
-
/* Connect with the NetVsp */
ret = netvsc_connect_vsp(device, net_device, device_info);
if (ret != 0) {
@@ -1319,6 +1304,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device 
*device,
goto close;
}
 
+   /* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
+* populated.
+*/
+   rcu_assign_pointer(net_device_ctx->nvdev, net_device);
+
return net_device;
 
 close:
-- 
2.16.1



[PATCH PATCH net v2 2/9] hv_netvsc: only wake transmit queue if link is up

2018-03-02 Thread Stephen Hemminger
Don't wake transmit queues if link is not up yet.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c5584c2d440e..fa6cf18e7719 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -91,12 +91,11 @@ static int netvsc_open(struct net_device *net)
return ret;
}
 
-   netif_tx_wake_all_queues(net);
-
rdev = nvdev->extension;
-
-   if (!rdev->link_state)
+   if (!rdev->link_state) {
netif_carrier_on(net);
+   netif_tx_wake_all_queues(net);
+   }
 
if (vf_netdev) {
/* Setting synthetic device up transparently sets
-- 
2.16.1



[PATCH PATCH net v2 0/9] hv_netvsc: minor fixes

2018-03-02 Thread Stephen Hemminger
These are improvements to netvsc driver. They aren't functionality
changes so not targeting net-next; and they are not show stopper
bugs that need to go to stable either.

v2
   - drop the irq flags patch, defer it to net-next
   - split the multicast filter flag patch out
   - change propagate rx mode patch to handle startup of vf

Stephen Hemminger (9):
  hv_netvsc: avoid retry on send during shutdown
  hv_netvsc: only wake transmit queue if link is up
  hv_netvsc: fix error unwind handling if vmbus_open fails
  hv_netvsc: cancel subchannel setup before halting device
  hv_netvsc: fix race in napi poll when rescheduling
  hv_netvsc: use napi_schedule_irqoff
  hv_netvsc: defer queue selection to VF
  hv_netvsc: filter multicast/broadcast
  hv_netvsc: propagate rx filters to VF

 drivers/net/hyperv/netvsc.c   | 33 -
 drivers/net/hyperv/netvsc_drv.c   | 62 ---
 drivers/net/hyperv/rndis_filter.c | 23 ++-
 3 files changed, 79 insertions(+), 39 deletions(-)

-- 
2.16.1



[PATCH net-next 5/5] net sched actions: implement get_fill_size routine in act_police

2018-03-02 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 net/sched/act_police.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 51fe4fe..d4b4b15 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -314,6 +314,13 @@ static int tcf_police_search(struct net *net, struct 
tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
 }
 
+static size_t tcf_police_get_fill_size(const struct tc_action *act)
+{
+   return nla_total_size(sizeof(struct tc_police)) /* TCA_POLICE_TBF */
+   + nla_total_size(sizeof(u32)) /* TCA_POLICE_RESULT */
+   + nla_total_size(sizeof(u32)); /* TCA_POLICE_AVRATE */
+}
+
 MODULE_AUTHOR("Alexey Kuznetsov");
 MODULE_DESCRIPTION("Policing actions");
 MODULE_LICENSE("GPL");
@@ -327,6 +334,7 @@ static struct tc_action_ops act_police_ops = {
.init   =   tcf_act_police_init,
.walk   =   tcf_act_police_walker,
.lookup =   tcf_police_search,
+   .get_fill_size  =   tcf_police_get_fill_size,
.size   =   sizeof(struct tcf_police),
 };
 
-- 
2.7.4



[PATCH net-next 4/5] net sched actions: implement get_fill_size routine in act_gact

2018-03-02 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 net/sched/act_gact.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 7456325..88fbb84 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -217,6 +217,19 @@ static int tcf_gact_search(struct net *net, struct 
tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
 }
 
+static size_t tcf_gact_get_fill_size(const struct tc_action *act)
+{
+   size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */
+
+#ifdef CONFIG_GACT_PROB
+   if (to_gact(act)->tcfg_ptype)
+   /* TCA_GACT_PROB */
+   sz += nla_total_size(sizeof(struct tc_gact_p));
+#endif
+
+   return sz;
+}
+
 static struct tc_action_ops act_gact_ops = {
.kind   =   "gact",
.type   =   TCA_ACT_GACT,
@@ -227,6 +240,7 @@ static struct tc_action_ops act_gact_ops = {
.init   =   tcf_gact_init,
.walk   =   tcf_gact_walker,
.lookup =   tcf_gact_search,
+   .get_fill_size  =   tcf_gact_get_fill_size,
.size   =   sizeof(struct tcf_gact),
 };
 
-- 
2.7.4



[PATCH net-next 2/5] net sched actions: add new tc_action_ops callback

2018-03-02 Thread Roman Mashak
Add a new callback in tc_action_ops, it will be needed by the tc actions
to compute its size when an ADD/DELETE notification message is constructed.
This routine has to take into account optional/variable size TLVs specific
per action.

Signed-off-by: Roman Mashak 
---
 include/net/act_api.h | 1 +
 net/sched/act_api.c   | 8 
 2 files changed, 9 insertions(+)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9c2f226..0a56465 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -97,6 +97,7 @@ struct tc_action_ops {
const struct tc_action_ops *,
struct netlink_ext_ack *);
void(*stats_update)(struct tc_action *, u64, u32, u64);
+   size_t  (*get_fill_size)(const struct tc_action *act);
struct net_device *(*get_dev)(const struct tc_action *a);
 };
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index acac92a..6f3307f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -136,6 +136,14 @@ static size_t tcf_action_full_attrs_size(size_t sz)
+ sz;
 }
 
+static size_t tcf_action_fill_size(const struct tc_action *act)
+{
+   if (act->ops->get_fill_size)
+   return act->ops->get_fill_size(act) +
+   tcf_action_shared_attrs_size(act);
+   return 0;
+}
+
 static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
   struct netlink_callback *cb)
 {
-- 
2.7.4



[PATCH net-next 3/5] net sched actions: calculate add/delete event message size

2018-03-02 Thread Roman Mashak
Update add/delete action logic to have the size for event messages,
the size is passed to tcf_add_notify() and tcf_del_notify().

Signed-off-by: Roman Mashak 
---
 include/net/act_api.h |  3 ++-
 net/sched/act_api.c   | 26 ++
 net/sched/cls_api.c   |  3 ++-
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 0a56465..e0a9c20 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -167,7 +167,8 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions, struct netlink_ext_ack *extack);
+   struct list_head *actions, size_t *attr_size,
+   struct netlink_ext_ack *extack);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 6f3307f..097ca07 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -776,10 +776,12 @@ static void cleanup_a(struct list_head *actions, int ovr)
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions, struct netlink_ext_ack *extack)
+   struct list_head *actions, size_t *attr_size,
+   struct netlink_ext_ack *extack)
 {
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
+   size_t sz = 0;
int err;
int i;
 
@@ -795,11 +797,14 @@ int tcf_action_init(struct net *net, struct tcf_proto 
*tp, struct nlattr *nla,
goto err;
}
act->order = i;
+   sz += tcf_action_fill_size(act);
if (ovr)
act->tcfa_refcnt++;
list_add_tail(>list, actions);
}
 
+   *attr_size = tcf_action_full_attrs_size(sz);
+
/* Remove the temp refcnt which was necessary to protect against
 * destroying an existing action which was being replaced
 */
@@ -1029,12 +1034,13 @@ static int tca_action_flush(struct net *net, struct 
nlattr *nla,
 
 static int
 tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-  u32 portid, struct netlink_ext_ack *extack)
+  u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
 {
int ret;
struct sk_buff *skb;
 
-   skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+   skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : 
attr_size,
+   GFP_KERNEL);
if (!skb)
return -ENOBUFS;
 
@@ -1067,6 +1073,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct 
nlmsghdr *n,
int i, ret;
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
+   size_t attr_size = 0;
LIST_HEAD(actions);
 
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
@@ -1088,13 +1095,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, 
struct nlmsghdr *n,
goto err;
}
act->order = i;
+   attr_size += tcf_action_fill_size(act);
list_add_tail(>list, );
}
 
if (event == RTM_GETACTION)
ret = tcf_get_notify(net, portid, n, , event, extack);
else { /* delete */
-   ret = tcf_del_notify(net, n, , portid, extack);
+   ret = tcf_del_notify(net, n, , portid, attr_size, 
extack);
if (ret)
goto err;
return ret;
@@ -1107,12 +1115,13 @@ tca_action_gd(struct net *net, struct nlattr *nla, 
struct nlmsghdr *n,
 
 static int
 tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-  u32 portid, struct netlink_ext_ack *extack)
+  u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
 {
struct sk_buff *skb;
int err = 0;
 
-   skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+   skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : 
attr_size,
+   GFP_KERNEL);
if (!skb)
return -ENOBUFS;
 
@@ -1134,15 +1143,16 @@ static int tcf_action_add(struct net *net, struct 
nlattr *nla,
  struct nlmsghdr *n, u32 portid, int ovr,
  struct netlink_ext_ack *extack)
 {
+   size_t attr_size = 0;
int ret = 0;
LIST_HEAD(actions);
 
ret = 

[PATCH net-next 0/5] Fix event generation for actions batch Add/Delete mode

2018-03-02 Thread Roman Mashak
When adding or deleting a batch of entries, the kernel sends up to
TCA_ACT_MAX_PRIO entries in an event to user space. However it does not
consider that the action sizes may vary and require different skb sizes.

For example :

% cat tc-batch.sh
#!/bin/bash
TC="sudo /mnt/iproute2.git/tc/tc"

$TC actions flush action gact
for i in `seq 1 $1`;
do
   cmd="action pass index $i "
   args=$args$cmd
done
$TC actions add $args
%
% ./tc-batch.sh 32
Error: Failed to fill netlink attributes while deleting TC action.
We have an error talking to the kernel
%

This patchset introduces new callback in tc_action_ops, which calculates
the action size, and passes size to tcf_add_notify()/tcf_del_notify(). The
patch fixes act_gact and act_police, and the rest of actions will be
updated in the follow-up patches.

Roman Mashak (5):
  net sched actions: routines to calculate common TLVs size
  net sched actions: add new tc_action_ops callback
  net sched actions: calculate add/delete event message size.
  net sched actions: implement get_fill_size routine in act_gact
  net sched actions: implement get_fill_size routine in act_police

 include/net/act_api.h  |  6 -
 net/sched/act_api.c| 62 +++---
 net/sched/act_gact.c   | 18 +++
 net/sched/act_police.c |  9 
 net/sched/cls_api.c|  3 ++-
 5 files changed, 88 insertions(+), 10 deletions(-)

-- 
2.7.4



[PATCH net-next 1/5] net sched actions: routines to calculate common TLVs size

2018-03-02 Thread Roman Mashak
Introduce routine to calculate size of the common tc netlink attributes,
and another helper routine to get the full message size including netlink
header and service header.

Signed-off-by: Roman Mashak 
---
 net/sched/act_api.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 1f65d6a..acac92a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -109,6 +109,33 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool 
strict)
 }
 EXPORT_SYMBOL(__tcf_idr_release);
 
+static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+{
+   u32 cookie_len = 0;
+
+   if (act->act_cookie)
+   cookie_len = nla_total_size(act->act_cookie->len);
+
+   return  nla_total_size(0) /* action number nested */
+   + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+   + cookie_len /* TCA_ACT_COOKIE */
+   + nla_total_size(0) /* TCA_ACT_STATS nested */
+   /* TCA_STATS_BASIC */
+   + nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+   /* TCA_STATS_QUEUE */
+   + nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+   + nla_total_size(0) /* TCA_OPTIONS nested */
+   + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */
+}
+
+static size_t tcf_action_full_attrs_size(size_t sz)
+{
+   return NLMSG_HDRLEN /* struct nlmsghdr */
+   + sizeof(struct tcamsg)
+   + nla_total_size(0) /* TCA_ACT_TAB nested */
+   + sz;
+}
+
 static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
   struct netlink_callback *cb)
 {
-- 
2.7.4



[PATCH PATCH net v2 3/9] hv_netvsc: fix error unwind handling if vmbus_open fails

2018-03-02 Thread Stephen Hemminger
Need to delete NAPI association if vmbus_open fails.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 686900d61374..ff97a85b2e9d 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1286,7 +1286,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device 
*device,
 netvsc_channel_cb, net_device->chan_table);
 
if (ret != 0) {
-   netif_napi_del(_device->chan_table[0].napi);
netdev_err(ndev, "unable to open channel: %d\n", ret);
goto cleanup;
}
@@ -1319,6 +1318,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device 
*device,
vmbus_close(device->channel);
 
 cleanup:
+   netif_napi_del(_device->chan_table[0].napi);
free_netvsc_device(_device->rcu);
 
return ERR_PTR(ret);
-- 
2.16.1



[PATCH PATCH net v2 5/9] hv_netvsc: fix race in napi poll when rescheduling

2018-03-02 Thread Stephen Hemminger
There is a race between napi_reschedule and re-enabling interrupts
which could lead to missed host interrupts.  This occurs when
interrupts are re-enabled (hv_end_read) and vmbus irq callback
(netvsc_channel_cb) has already scheduled NAPI.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index ff97a85b2e9d..4237cedc4f08 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1207,9 +1207,10 @@ int netvsc_poll(struct napi_struct *napi, int budget)
if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
work_done < budget &&
napi_complete_done(napi, work_done) &&
-   hv_end_read(>inbound)) {
+   hv_end_read(>inbound) &&
+   napi_schedule_prep(napi)) {
hv_begin_read(>inbound);
-   napi_reschedule(napi);
+   __napi_schedule(napi);
}
 
/* Driver may overshoot since multiple packets per descriptor */
-- 
2.16.1



[PATCH PATCH net v2 8/9] hv_netvsc: filter multicast/broadcast

2018-03-02 Thread Stephen Hemminger
The netvsc driver was always enabling all multicast and broadcast
even if netdevice flag had not enabled it.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/rndis_filter.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 1cba767c6453..8927c483c217 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -854,15 +854,19 @@ static void rndis_set_multicast(struct work_struct *w)
 {
struct rndis_device *rdev
= container_of(w, struct rndis_device, mcast_work);
+   u32 filter = NDIS_PACKET_TYPE_DIRECTED;
+   unsigned int flags = rdev->ndev->flags;
 
-   if (rdev->ndev->flags & IFF_PROMISC)
-   rndis_filter_set_packet_filter(rdev,
-  NDIS_PACKET_TYPE_PROMISCUOUS);
-   else
-   rndis_filter_set_packet_filter(rdev,
-  NDIS_PACKET_TYPE_BROADCAST |
-  NDIS_PACKET_TYPE_ALL_MULTICAST |
-  NDIS_PACKET_TYPE_DIRECTED);
+   if (flags & IFF_PROMISC) {
+   filter = NDIS_PACKET_TYPE_PROMISCUOUS;
+   } else {
+   if (flags & IFF_ALLMULTI)
+   flags |= NDIS_PACKET_TYPE_ALL_MULTICAST;
+   if (flags & IFF_BROADCAST)
+   flags |= NDIS_PACKET_TYPE_BROADCAST;
+   }
+
+   rndis_filter_set_packet_filter(rdev, filter);
 }
 
 void rndis_filter_update(struct netvsc_device *nvdev)
-- 
2.16.1



[PATCH PATCH net v2 4/9] hv_netvsc: cancel subchannel setup before halting device

2018-03-02 Thread Stephen Hemminger
Block setup of multiple channels earlier in the teardown
process. This avoids possible races between halt and subchannel
initialization.

Suggested-by: Haiyang Zhang 
Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/rndis_filter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index c3ca191fea7f..1cba767c6453 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1340,6 +1340,9 @@ void rndis_filter_device_remove(struct hv_device *dev,
 {
struct rndis_device *rndis_dev = net_dev->extension;
 
+   /* Don't try and setup sub channels if about to halt */
+   cancel_work_sync(_dev->subchan_work);
+
/* Halt and release the rndis device */
rndis_filter_halt_device(rndis_dev);
 
-- 
2.16.1



[PATCH PATCH net v2 9/9] hv_netvsc: propagate rx filters to VF

2018-03-02 Thread Stephen Hemminger
The netvsc device should propagate filters to the SR-IOV VF
device (if present). The flags also need to be propagated to the
VF device as well. This only really matters on local Hyper-V
since Azure does not support multiple addresses.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c | 40 
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5299cfb16ce2..cdb78eefab67 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -66,10 +66,36 @@ static int debug = -1;
 module_param(debug, int, S_IRUGO);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
-static void netvsc_set_multicast_list(struct net_device *net)
+static void netvsc_change_rx_flags(struct net_device *net, int change)
 {
-   struct net_device_context *net_device_ctx = netdev_priv(net);
-   struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
+   struct net_device_context *ndev_ctx = netdev_priv(net);
+   struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+   int inc;
+
+   if (!vf_netdev)
+   return;
+
+   if (change & IFF_PROMISC) {
+   inc = (net->flags & IFF_PROMISC) ? 1 : -1;
+   dev_set_promiscuity(vf_netdev, inc);
+   }
+
+   if (change & IFF_ALLMULTI) {
+   inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
+   dev_set_allmulti(vf_netdev, inc);
+   }
+}
+
+static void netvsc_set_rx_mode(struct net_device *net)
+{
+   struct net_device_context *ndev_ctx = netdev_priv(net);
+   struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+   struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
+
+   if (vf_netdev) {
+   dev_uc_sync(vf_netdev, net);
+   dev_mc_sync(vf_netdev, net);
+   }
 
rndis_filter_update(nvdev);
 }
@@ -1586,7 +1612,8 @@ static const struct net_device_ops device_ops = {
.ndo_open = netvsc_open,
.ndo_stop = netvsc_close,
.ndo_start_xmit =   netvsc_start_xmit,
-   .ndo_set_rx_mode =  netvsc_set_multicast_list,
+   .ndo_change_rx_flags =  netvsc_change_rx_flags,
+   .ndo_set_rx_mode =  netvsc_set_rx_mode,
.ndo_change_mtu =   netvsc_change_mtu,
.ndo_validate_addr =eth_validate_addr,
.ndo_set_mac_address =  netvsc_set_mac_addr,
@@ -1817,6 +1844,11 @@ static void __netvsc_vf_setup(struct net_device *ndev,
netdev_warn(vf_netdev,
"unable to change mtu to %u\n", ndev->mtu);
 
+   /* set multicast etc flags on VF */
+   dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE);
+   dev_uc_sync(vf_netdev, ndev);
+   dev_mc_sync(vf_netdev, ndev);
+
if (netif_running(ndev)) {
ret = dev_open(vf_netdev);
if (ret)
-- 
2.16.1



[PATCH PATCH net v2 7/9] hv_netvsc: defer queue selection to VF

2018-03-02 Thread Stephen Hemminger
When VF is used for accelerated networking it will likely have
more queues (and different policy) than the synthetic NIC.
This patch defers the queue policy to the VF so that all the
queues can be used. This impacts workloads like locally generated UDP.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index fa6cf18e7719..5299cfb16ce2 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -298,8 +298,19 @@ static u16 netvsc_select_queue(struct net_device *ndev, 
struct sk_buff *skb,
rcu_read_lock();
vf_netdev = rcu_dereference(ndc->vf_netdev);
if (vf_netdev) {
-   txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
-   qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+   const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;
+
+   if (vf_ops->ndo_select_queue)
+   txq = vf_ops->ndo_select_queue(vf_netdev, skb,
+  accel_priv, fallback);
+   else
+   txq = fallback(vf_netdev, skb);
+
+   /* Record the queue selected by VF so that it can be
+* used for common case where VF has more queues than
+* the synthetic device.
+*/
+   qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
} else {
txq = netvsc_pick_tx(ndev, skb);
}
-- 
2.16.1



[PATCH PATCH net v2 6/9] hv_netvsc: use napi_schedule_irqoff

2018-03-02 Thread Stephen Hemminger
Since the netvsc_channel_cb is already called in interrupt
context from vmbus, there is no need to do irqsave/restore.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 4237cedc4f08..0265d703eb03 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1233,7 +1233,7 @@ void netvsc_channel_cb(void *context)
/* disable interupts from host */
hv_begin_read(rbi);
 
-   __napi_schedule(>napi);
+   __napi_schedule_irqoff(>napi);
}
 }
 
-- 
2.16.1



Good Morning !

2018-03-02 Thread Mr. Allen


Good day.

Do you need a loan to pay off bills ? To pay off your mortgage quickly ? To set 
up a new business or to Re- finance your existing business ? I can help you 
secure a private loan should you be interested please respond for more details 


Thanks 

Allen




Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Michael S. Tsirkin
On Fri, Mar 02, 2018 at 01:11:56PM -0800, Siwei Liu wrote:
> On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
>  wrote:
> > This patch enables virtio_net to switch over to a VF datapath when a VF
> > netdev is present with the same MAC address. It allows live migration
> > of a VM with a direct attached VF without the need to setup a bond/team
> > between a VF and virtio net device in the guest.
> >
> > The hypervisor needs to enable only one datapath at any time so that
> > packets don't get looped back to the VM over the other datapath. When a VF
> > is plugged, the virtio datapath link state can be marked as down. The
> > hypervisor needs to unplug the VF device from the guest on the source host
> > and reset the MAC filter of the VF to initiate failover of datapath to
> > virtio before starting the migration. After the migration is completed,
> > the destination hypervisor sets the MAC filter on the VF and plugs it back
> > to the guest to switch over to VF datapath.
> >
> > When BACKUP feature is enabled, an additional netdev(bypass netdev) is
> > created that acts as a master device and tracks the state of the 2 lower
> > netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> > passthru device with the same MAC is registered as 'active' netdev.
> >
> > This patch is based on the discussion initiated by Jesse on this thread.
> > https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> >
> > Signed-off-by: Sridhar Samudrala 
> > Signed-off-by: Alexander Duyck 
> > Reviewed-by: Jesse Brandeburg 
> > ---
> >  drivers/net/virtio_net.c | 683 
> > ++-
> >  1 file changed, 682 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index bcd13fe906ca..f2860d86c952 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -30,6 +30,8 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> >  #include 
> >  #include 
> >
> > @@ -206,6 +208,9 @@ struct virtnet_info {
> > u32 speed;
> >
> > unsigned long guest_offloads;
> > +
> > +   /* upper netdev created when BACKUP feature enabled */
> > +   struct net_device *bypass_netdev;
> >  };
> >
> >  struct padded_vnet_hdr {
> > @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, 
> > struct netdev_bpf *xdp)
> > }
> >  }
> >
> > +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
> > + size_t len)
> > +{
> > +   struct virtnet_info *vi = netdev_priv(dev);
> > +   int ret;
> > +
> > +   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> > +   return -EOPNOTSUPP;
> > +
> > +   ret = snprintf(buf, len, "_bkup");
> > +   if (ret >= len)
> > +   return -EOPNOTSUPP;
> > +
> > +   return 0;
> > +}
> > +
> 
> What if the systemd/udevd is not new enough to enforce the
> n naming? Would virtio_bypass get a different name
> than the original virtio_net?

You mean people using ethX names? Any hardware config change breaks
these, I don't think that can be helped.

> Should we detect this earlier and fall
> back to legacy mode without creating the bypass netdev and enslaving
> the VF?

I don't think we can do this with existing kernel/userspace APIs.

-- 
MST


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Michael S. Tsirkin
On Fri, Mar 02, 2018 at 12:56:21PM -0800, Samudrala, Sridhar wrote:
> 
> 
> On 3/2/2018 12:44 PM, Siwei Liu wrote:
> > On Fri, Mar 2, 2018 at 12:10 PM, Michael S. Tsirkin  wrote:
> > > On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:
> > > > 
> > > > On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:
> > > > > On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:
> > > > > > The design limits things to a 1:1 relationship since we just have 
> > > > > > the
> > > > > > child and backup pointers, but I don't think I am seeing exception
> > > > > > handling to prevent us from overwriting the child pointers so there
> > > > > > may be a leak there.
> > > > > > 
> > > > > > Thanks.
> > > > > > 
> > > > > > - Alex
> > > > > In fact maintaining a list in that case would be nicer, and
> > > > > just use an arbitrary one.
> > > > > E.g. one can see how a user wanting to swap device 1 for device 2
> > > > > might first add device 2 with same MAC then drop device 1.
> > > > It should be possible to swap VF1 with VF2 by
> > > > 1.- enabling virtio link
> > > > 2.- unplugging VF1
> > > > 3.- plugging VF2
> > > > 4.- disabling virtio link
> > > > 
> > > True, but it isn't hard to avoid breakage if user
> > > swapped steps 2 and 3. No need to make it more
> > > fragile than it has to be.
> > The migration case, VF2 is associated with another PF on another
> > machine (destination), I wonder how it is possible.
> > 
> > Even with local plugging of VF2 on the same PF, the MAC address
> > requirement (VF1's == VF2's) would fail the MAC address assignment on
> > VF2.
> > 
> > 
> I didn't include updating the MAC filter step in the above sequence.
> So definitely plugging 2 VFs with the same MAC address will be an issue.

If these are two separate PFs then I don't see why -
each has its own MAC filter.

> Here is the more complete sequence of steps that are required to
> enable live migration.

Replacing VF1 by VF2 is not about migration. It's to remove PF
from host e.g. for service.

-- 
MST


Re: [PATCH iproute2] bpf: Print section name when hitting non ld64 issue

2018-03-02 Thread Stephen Hemminger
On Wed, 28 Feb 2018 14:16:42 -0800
Joe Stringer  wrote:

> It's useful to be able to tell which section is being processed in the
> ELF when this error is triggered, so print that detail.
> 
> Signed-off-by: Joe Stringer 

Applied


Respond for details

2018-03-02 Thread Mr. Allen


Good day.

Do you need a loan to pay off bills ? To pay off your mortgage quickly ? To set 
up a new business or to Re- finance your existing business ? I can help you 
secure a private loan should you be interested please respond for more details 


Thanks 

Allen




Re: [PATCH iproute2] libnetlink: __rtnl_talk_iov should only loop max iovlen times

2018-03-02 Thread Stephen Hemminger
On Thu,  1 Mar 2018 14:43:08 -0800
David Ahern  wrote:

> William reported ip hanging and bisected to a recent commit for batching
> allowing more than 1 command to be sent per message. The loop over
> recvmsg should never cycle more than iovlen times -- 1 response for
> each command in the message.
> 
> Fixes: 72a2ff3916e5 ("lib/libnetlink: Add a new function rtnl_talk_iov")
> Signed-off-by: David Ahern 
> ---

Applied


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Michael S. Tsirkin
On Fri, Mar 02, 2018 at 12:44:56PM -0800, Siwei Liu wrote:
> On Fri, Mar 2, 2018 at 12:10 PM, Michael S. Tsirkin  wrote:
> > On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:
> >>
> >>
> >> On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:
> >> > On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:
> >> > > The design limits things to a 1:1 relationship since we just have the
> >> > > child and backup pointers, but I don't think I am seeing exception
> >> > > handling to prevent us from overwriting the child pointers so there
> >> > > may be a leak there.
> >> > >
> >> > > Thanks.
> >> > >
> >> > > - Alex
> >> > In fact maintaining a list in that case would be nicer, and
> >> > just use an arbitrary one.
> >> > E.g. one can see how a user wanting to swap device 1 for device 2
> >> > might first add device 2 with same MAC then drop device 1.
> >>
> >> It should be possible to swap VF1 with VF2 by
> >> 1.- enabling virtio link
> >> 2.- unplugging VF1
> >> 3.- plugging VF2
> >> 4.- disabling virtio link
> >>
> >
> > True, but it isn't hard to avoid breakage if user
> > swapped steps 2 and 3. No need to make it more
> > fragile than it has to be.
> 
> The migration case, VF2 is associated with another PF on another
> machine (destination), I wonder how it is possible.

E.g. you want to remove the PF so you unplug the VF
then add another VF of the same PF.

> Even with local plugging of VF2 on the same PF, the MAC address
> requirement (VF1's == VF2's) would fail the MAC address assignment on
> VF2.
> 
> -Siwei

Why would it fail? These are separate cards.

> >
> > --
> > MST
> >
> > -
> > To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org
> > For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org
> >


Re: [PATCH 09/58] net/irda: Convert timers to use timer_setup()

2018-03-02 Thread Marcelo Ricardo Leitner
On Mon, Oct 16, 2017 at 05:28:53PM -0700, Kees Cook wrote:
> In preparation for unconditionally passing the struct timer_list pointer to
> all timer callbacks, switch to using the new timer_setup() and from_timer()
> to pass the timer pointer explicitly.
> 
> Cc: Samuel Ortiz 
> Cc: "David S. Miller" 
> Cc: Stephen Hemminger 
> Cc: Johannes Berg 
> Cc: Ingo Molnar 
> Cc: netdev@vger.kernel.org
> Signed-off-by: Kees Cook 
> ---
>  .../staging/irda/include/net/irda/irlmp_event.h|  6 +--
>  drivers/staging/irda/include/net/irda/timer.h  | 11 ++---
>  drivers/staging/irda/net/af_irda.c |  7 ++-
>  drivers/staging/irda/net/ircomm/ircomm_tty.c   |  2 +-
>  .../staging/irda/net/ircomm/ircomm_tty_attach.c|  8 ++--
>  drivers/staging/irda/net/irda_device.c | 10 ++--
>  drivers/staging/irda/net/iriap.c   | 10 ++--
>  drivers/staging/irda/net/irlan/irlan_client.c  |  6 +--
>  drivers/staging/irda/net/irlan/irlan_common.c  |  4 +-
>  drivers/staging/irda/net/irlap.c   | 16 +++
>  drivers/staging/irda/net/irlap_event.c |  6 +--
>  drivers/staging/irda/net/irlmp.c   |  8 ++--
>  drivers/staging/irda/net/irlmp_event.c | 10 ++--
>  drivers/staging/irda/net/irttp.c   | 11 ++---
>  drivers/staging/irda/net/timer.c   | 54 
> +++---
>  15 files changed, 79 insertions(+), 90 deletions(-)
> 
> diff --git a/drivers/staging/irda/include/net/irda/irlmp_event.h 
> b/drivers/staging/irda/include/net/irda/irlmp_event.h
> index 9e4ec17a7449..a1a082fe384e 100644
> --- a/drivers/staging/irda/include/net/irda/irlmp_event.h
> +++ b/drivers/staging/irda/include/net/irda/irlmp_event.h
> @@ -82,9 +82,9 @@ typedef enum {
>  extern const char *const irlmp_state[];
>  extern const char *const irlsap_state[];
>  
> -void irlmp_watchdog_timer_expired(void *data);
> -void irlmp_discovery_timer_expired(void *data);
> -void irlmp_idle_timer_expired(void *data);
> +void irlmp_watchdog_timer_expired(struct timer_list *t);
> +void irlmp_discovery_timer_expired(struct timer_list *t);
> +void irlmp_idle_timer_expired(struct timer_list *t);
>  
>  void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event, 
>   struct sk_buff *skb);
> diff --git a/drivers/staging/irda/include/net/irda/timer.h 
> b/drivers/staging/irda/include/net/irda/timer.h
> index d784f242cf7b..a6635f0afae9 100644
> --- a/drivers/staging/irda/include/net/irda/timer.h
> +++ b/drivers/staging/irda/include/net/irda/timer.h
> @@ -72,14 +72,11 @@ struct lap_cb;
>  
>  #define WATCHDOG_TIMEOUT (20*HZ)   /* 20 sec */
>  
> -typedef void (*TIMER_CALLBACK)(void *);
> -
> -static inline void irda_start_timer(struct timer_list *ptimer, int timeout, 
> - void* data, TIMER_CALLBACK callback)
> +static inline void irda_start_timer(struct timer_list *ptimer, int timeout,
> + void (*callback)(struct timer_list *))
>  {
> - ptimer->function = (void (*)(unsigned long)) callback;
> - ptimer->data = (unsigned long) data;
> - 
> + ptimer->function = (TIMER_FUNC_TYPE) callback;
> +
>   /* Set new value for timer (update or add timer).
>* We use mod_timer() because it's more efficient and also
>* safer with respect to race conditions - Jean II */
> diff --git a/drivers/staging/irda/net/af_irda.c 
> b/drivers/staging/irda/net/af_irda.c
> index 23fa7c8b09a5..b82a47b9ef0b 100644
> --- a/drivers/staging/irda/net/af_irda.c
> +++ b/drivers/staging/irda/net/af_irda.c
> @@ -429,11 +429,11 @@ static void 
> irda_selective_discovery_indication(discinfo_t *discovery,
>   * We were waiting for a node to be discovered, but nothing has come up
>   * so far. Wake up the user and tell him that we failed...
>   */
> -static void irda_discovery_timeout(u_long priv)
> +static void irda_discovery_timeout(struct timer_list *t)
>  {
>   struct irda_sock *self;
>  
> - self = (struct irda_sock *) priv;
> + self = from_timer(self, t, watchdog);
>   BUG_ON(self == NULL);
>  
>   /* Nothing for the caller */
> @@ -2505,8 +2505,7 @@ static int irda_getsockopt(struct socket *sock, int 
> level, int optname,
>  
>   /* Set watchdog timer to expire in <val> ms. */
>   self->errno = 0;
> - setup_timer(&self->watchdog, irda_discovery_timeout,
> - (unsigned long)self);
> + timer_setup(&self->watchdog, irda_discovery_timeout, 0);
>   mod_timer(&self->watchdog,
> jiffies + msecs_to_jiffies(val));
>  
> diff --git a/drivers/staging/irda/net/ircomm/ircomm_tty.c 
> b/drivers/staging/irda/net/ircomm/ircomm_tty.c
> index ec157c3419b5..473abfaffe7b 

Re: [PATCH] selinux: Fix ltp test connect-syscall failure

2018-03-02 Thread Paul Moore
On Fri, Mar 2, 2018 at 2:54 PM, Richard Haines
 wrote:
> Fix the following error when running regression tests using LTP as follows:
> cd /opt/ltp/
> cat runtest/syscalls |grep connect01>runtest/connect-syscall
> ./runltp -pq -f connect-syscall
>
> Running tests...
> connect01    1  TPASS  :  bad file descriptor successful
> connect01    2  TPASS  :  invalid socket buffer successful
> connect01    3  TPASS  :  invalid salen successful
> connect01    4  TPASS  :  invalid socket successful
> connect01    5  TPASS  :  already connected successful
> connect01    6  TPASS  :  connection refused successful
> connect01    7  TFAIL  :  connect01.c:146: invalid address family ;
> returned -1 (expected -1), errno 22 (expected 97)
> INFO: ltp-pan reported some tests FAIL
> LTP Version: 20180118
>
> Reported-by: Anders Roxell 
> Signed-off-by: Richard Haines 
> ---
>  security/selinux/hooks.c | 42 ++
>  1 file changed, 30 insertions(+), 12 deletions(-)

Merged, thanks guys.

> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 28a5c4e..d614df1 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -4470,22 +4470,29 @@ static int selinux_socket_bind(struct socket *sock, 
> struct sockaddr *address, in
>  * need to check address->sa_family as it is possible to have
>  * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
>  */
> -   if (address->sa_family == AF_INET) {
> -   if (addrlen < sizeof(struct sockaddr_in)) {
> -   err = -EINVAL;
> -   goto out;
> -   }
> +   switch (address->sa_family) {
> +   case AF_INET:
> +   if (addrlen < sizeof(struct sockaddr_in))
> +   return -EINVAL;
> addr4 = (struct sockaddr_in *)address;
> snum = ntohs(addr4->sin_port);
> addrp = (char *)&addr4->sin_addr.s_addr;
> -   } else {
> -   if (addrlen < SIN6_LEN_RFC2133) {
> -   err = -EINVAL;
> -   goto out;
> -   }
> +   break;
> +   case AF_INET6:
> +   if (addrlen < SIN6_LEN_RFC2133)
> +   return -EINVAL;
> addr6 = (struct sockaddr_in6 *)address;
> snum = ntohs(addr6->sin6_port);
> addrp = (char *)&addr6->sin6_addr.s6_addr;
> +   break;
> +   default:
> +   /* Note that SCTP services expect -EINVAL, whereas
> +* others expect -EAFNOSUPPORT.
> +*/
> +   if (sksec->sclass == SECCLASS_SCTP_SOCKET)
> +   return -EINVAL;
> +   else
> +   return -EAFNOSUPPORT;
> }
>
> if (snum) {
> @@ -4589,16 +4596,27 @@ static int selinux_socket_connect_helper(struct 
> socket *sock,
>  * need to check address->sa_family as it is possible to have
>  * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
>  */
> -   if (address->sa_family == AF_INET) {
> +   switch (address->sa_family) {
> +   case AF_INET:
> addr4 = (struct sockaddr_in *)address;
> if (addrlen < sizeof(struct sockaddr_in))
> return -EINVAL;
> snum = ntohs(addr4->sin_port);
> -   } else {
> +   break;
> +   case AF_INET6:
> addr6 = (struct sockaddr_in6 *)address;
> if (addrlen < SIN6_LEN_RFC2133)
> return -EINVAL;
> snum = ntohs(addr6->sin6_port);
> +   break;
> +   default:
> +   /* Note that SCTP services expect -EINVAL, whereas
> +* others expect -EAFNOSUPPORT.
> +*/
> +   if (sksec->sclass == SECCLASS_SCTP_SOCKET)
> +   return -EINVAL;
> +   else
> +   return -EAFNOSUPPORT;
> }
>
> err = sel_netport_sid(sk->sk_protocol, snum, &sid);
> --
> 2.14.3
>



-- 
paul moore
www.paul-moore.com


Re: [PATCH v2 4/4] net: make skb_gso_*_seglen functions private

2018-03-02 Thread Marcelo Ricardo Leitner
On Thu, Mar 01, 2018 at 05:13:40PM +1100, Daniel Axtens wrote:
> They're very hard to use properly as they do not consider the
> GSO_BY_FRAGS case. Code should use skb_gso_validate_network_len
> and skb_gso_validate_mac_len as they do consider this case.
> 
> Make the seglen functions static, which stops people using them
> outside of skbuff.c
> 
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marcelo Ricardo Leitner 

> 
> ---
> 
> v2: drop inline from functions.
> ---
>  include/linux/skbuff.h | 33 -
>  net/core/skbuff.c  | 37 +++--
>  2 files changed, 35 insertions(+), 35 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index a057dd1a75c7..ddf77cf4ff2d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3285,7 +3285,6 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff 
> *from,
>  void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
>  int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
>  void skb_scrub_packet(struct sk_buff *skb, bool xnet);
> -unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
>  bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int 
> mtu);
>  bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
>  struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
> @@ -4104,38 +4103,6 @@ static inline bool skb_head_is_locked(const struct 
> sk_buff *skb)
>   return !skb->head_frag || skb_cloned(skb);
>  }
>  
> -/**
> - * skb_gso_network_seglen - Return length of individual segments of a gso 
> packet
> - *
> - * @skb: GSO skb
> - *
> - * skb_gso_network_seglen is used to determine the real size of the
> - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
> - *
> - * The MAC/L2 header is not accounted for.
> - */
> -static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
> -{
> - unsigned int hdr_len = skb_transport_header(skb) -
> -skb_network_header(skb);
> - return hdr_len + skb_gso_transport_seglen(skb);
> -}
> -
> -/**
> - * skb_gso_mac_seglen - Return length of individual segments of a gso packet
> - *
> - * @skb: GSO skb
> - *
> - * skb_gso_mac_seglen is used to determine the real size of the
> - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
> - * headers (TCP/UDP).
> - */
> -static inline unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
> -{
> - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
> - return hdr_len + skb_gso_transport_seglen(skb);
> -}
> -
>  /* Local Checksum Offload.
>   * Compute outer checksum based on the assumption that the
>   * inner checksum will be offloaded later.
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index b63767008824..0bb0d8877954 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -4891,7 +4891,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
>   *
>   * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
>   */
> -unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
> +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
>  {
>   const struct skb_shared_info *shinfo = skb_shinfo(skb);
>   unsigned int thlen = 0;
> @@ -4913,7 +4913,40 @@ unsigned int skb_gso_transport_seglen(const struct 
> sk_buff *skb)
>*/
>   return thlen + shinfo->gso_size;
>  }
> -EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
> +
> +/**
> + * skb_gso_network_seglen - Return length of individual segments of a gso 
> packet
> + *
> + * @skb: GSO skb
> + *
> + * skb_gso_network_seglen is used to determine the real size of the
> + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
> + *
> + * The MAC/L2 header is not accounted for.
> + */
> +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
> +{
> + unsigned int hdr_len = skb_transport_header(skb) -
> +skb_network_header(skb);
> +
> + return hdr_len + skb_gso_transport_seglen(skb);
> +}
> +
> +/**
> + * skb_gso_mac_seglen - Return length of individual segments of a gso packet
> + *
> + * @skb: GSO skb
> + *
> + * skb_gso_mac_seglen is used to determine the real size of the
> + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
> + * headers (TCP/UDP).
> + */
> +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
> +{
> + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
> +
> + return hdr_len + skb_gso_transport_seglen(skb);
> +}
>  
>  /**
>   * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
> -- 
> 2.14.1
> 


Re: [PATCH v2 3/4] net: xfrm: use skb_gso_validate_network_len() to check gso sizes

2018-03-02 Thread Marcelo Ricardo Leitner
On Thu, Mar 01, 2018 at 05:13:39PM +1100, Daniel Axtens wrote:
> Replace skb_gso_network_seglen() with
> skb_gso_validate_network_len(), as it considers the GSO_BY_FRAGS
> case.
> 
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
>  net/ipv4/xfrm4_output.c | 3 ++-
>  net/ipv6/xfrm6_output.c | 2 +-
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
> index 94b8702603bc..be980c195fc5 100644
> --- a/net/ipv4/xfrm4_output.c
> +++ b/net/ipv4/xfrm4_output.c
> @@ -30,7 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
>  
>   mtu = dst_mtu(skb_dst(skb));
>   if ((!skb_is_gso(skb) && skb->len > mtu) ||
> - (skb_is_gso(skb) && skb_gso_network_seglen(skb) > 
> ip_skb_dst_mtu(skb->sk, skb))) {
> + (skb_is_gso(skb) &&
> +  !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) 
> {
>   skb->protocol = htons(ETH_P_IP);
>  
>   if (skb->sk)
> diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
> index 8ae87d4ec5ff..5959ce9620eb 100644
> --- a/net/ipv6/xfrm6_output.c
> +++ b/net/ipv6/xfrm6_output.c
> @@ -82,7 +82,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
>  
>   if ((!skb_is_gso(skb) && skb->len > mtu) ||
>   (skb_is_gso(skb) &&
> -  skb_gso_network_seglen(skb) > ip6_skb_dst_mtu(skb))) {
> +  !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) {
>   skb->dev = dst->dev;
>   skb->protocol = htons(ETH_P_IPV6);
>  
> -- 
> 2.14.1
> 


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
 wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
>
> The hypervisor needs to enable only one datapath at any time so that
> packets don't get looped back to the VM over the other datapath. When a VF
> is plugged, the virtio datapath link state can be marked as down. The
> hypervisor needs to unplug the VF device from the guest on the source host
> and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed,
> the destination hypervisor sets the MAC filter on the VF and plugs it back
> to the guest to switch over to VF datapath.
>
> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
> created that acts as a master device and tracks the state of the 2 lower
> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> passthru device with the same MAC is registered as 'active' netdev.
>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala 
> Signed-off-by: Alexander Duyck 
> Reviewed-by: Jesse Brandeburg 
> ---
>  drivers/net/virtio_net.c | 683 
> ++-
>  1 file changed, 682 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index bcd13fe906ca..f2860d86c952 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -30,6 +30,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>
> @@ -206,6 +208,9 @@ struct virtnet_info {
> u32 speed;
>
> unsigned long guest_offloads;
> +
> +   /* upper netdev created when BACKUP feature enabled */
> +   struct net_device *bypass_netdev;
>  };
>
>  struct padded_vnet_hdr {
> @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct 
> netdev_bpf *xdp)
> }
>  }
>
> +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
> + size_t len)
> +{
> +   struct virtnet_info *vi = netdev_priv(dev);
> +   int ret;
> +
> +   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> +   return -EOPNOTSUPP;
> +
> +   ret = snprintf(buf, len, "_bkup");
> +   if (ret >= len)
> +   return -EOPNOTSUPP;
> +
> +   return 0;
> +}
> +

What if the systemd/udevd is not new enough to enforce the
n<phys_port_name> naming? Would virtio_bypass get a different name
than the original virtio_net? Should we detect this earlier and fall
back to legacy mode without creating the bypass netdev and enslaving
the VF?

>  static const struct net_device_ops virtnet_netdev = {
> .ndo_open= virtnet_open,
> .ndo_stop= virtnet_close,
> @@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev = {
> .ndo_xdp_xmit   = virtnet_xdp_xmit,
> .ndo_xdp_flush  = virtnet_xdp_flush,
> .ndo_features_check = passthru_features_check,
> +   .ndo_get_phys_port_name = virtnet_get_phys_port_name,
>  };
>
>  static void virtnet_config_changed_work(struct work_struct *work)
> @@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device 
> *vdev)
> return 0;
>  }
>
> +/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
> + * When BACKUP feature is enabled, an additional netdev(bypass netdev)
> + * is created that acts as a master device and tracks the state of the
> + * 2 lower netdevs. The original virtio_net netdev is registered as
> + * 'backup' netdev and a passthru device with the same MAC is registered
> + * as 'active' netdev.
> + */
> +
> +/* bypass state maintained when BACKUP feature is enabled */
> +struct virtnet_bypass_info {
> +   /* passthru netdev with same MAC */
> +   struct net_device __rcu *active_netdev;
> +
> +   /* virtio_net netdev */
> +   struct net_device __rcu *backup_netdev;
> +
> +   /* active netdev stats */
> +   struct rtnl_link_stats64 active_stats;
> +
> +   /* backup netdev stats */
> +   struct rtnl_link_stats64 backup_stats;
> +
> +   /* aggregated stats */
> +   struct rtnl_link_stats64 bypass_stats;
> +
> +   /* spinlock while updating stats */
> +   spinlock_t stats_lock;
> +};
> +
> +static void virtnet_bypass_child_open(struct net_device *dev,
> + struct net_device *child_netdev)
> +{
> +   int err = dev_open(child_netdev);
> +
> +   if (err)
> +   

Re: [PATCH v2 1/4] net: rename skb_gso_validate_mtu -> skb_gso_validate_network_len

2018-03-02 Thread Marcelo Ricardo Leitner
On Thu, Mar 01, 2018 at 05:13:37PM +1100, Daniel Axtens wrote:
> If you take a GSO skb, and split it into packets, will the network
> length (L3 headers + L4 headers + payload) of those packets be small
> enough to fit within a given MTU?
> 
> skb_gso_validate_mtu gives you the answer to that question. However,
> we recently added a way to validate the MAC length of a split GSO
> skb (L2+L3+L4+payload), and the names get confusing, so rename
> skb_gso_validate_mtu to skb_gso_validate_network_len
> 
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
>  include/linux/skbuff.h  |  2 +-
>  net/core/skbuff.c   | 11 ++-
>  net/ipv4/ip_forward.c   |  2 +-
>  net/ipv4/ip_output.c|  2 +-
>  net/ipv4/netfilter/nf_flow_table_ipv4.c |  2 +-
>  net/ipv6/ip6_output.c   |  2 +-
>  net/ipv6/netfilter/nf_flow_table_ipv6.c |  2 +-
>  net/mpls/af_mpls.c  |  2 +-
>  net/xfrm/xfrm_device.c  |  2 +-
>  9 files changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index c1e66bdcf583..a057dd1a75c7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3286,7 +3286,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff 
> *skb1, const u32 len);
>  int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
>  void skb_scrub_packet(struct sk_buff *skb, bool xnet);
>  unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
> -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu);
> +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int 
> mtu);
>  bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
>  struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
>  struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 09bd89c90a71..b63767008824 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -4955,19 +4955,20 @@ static inline bool skb_gso_size_check(const struct 
> sk_buff *skb,
>  }
>  
>  /**
> - * skb_gso_validate_mtu - Return in case such skb fits a given MTU
> + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
>   *
>   * @skb: GSO skb
>   * @mtu: MTU to validate against
>   *
> - * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU
> - * once split.
> + * skb_gso_validate_network_len validates if a given skb will fit a
> + * wanted MTU once split. It considers L3 headers, L4 headers, and the
> + * payload.
>   */
> -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu)
> +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int 
> mtu)
>  {
>   return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
>  }
> -EXPORT_SYMBOL_GPL(skb_gso_validate_mtu);
> +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
>  
>  /**
>   * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
> diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
> index 2dd21c3281a1..b54b948b0596 100644
> --- a/net/ipv4/ip_forward.c
> +++ b/net/ipv4/ip_forward.c
> @@ -55,7 +55,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, 
> unsigned int mtu)
>   if (skb->ignore_df)
>   return false;
>  
> - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
> + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
>   return false;
>  
>   return true;
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index e8e675be60ec..66340ab750e6 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -248,7 +248,7 @@ static int ip_finish_output_gso(struct net *net, struct 
> sock *sk,
>  
>   /* common case: seglen is <= mtu
>*/
> - if (skb_gso_validate_mtu(skb, mtu))
> + if (skb_gso_validate_network_len(skb, mtu))
>   return ip_finish_output2(net, sk, skb);
>  
>   /* Slowpath -  GSO segment length exceeds the egress MTU.
> diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c 
> b/net/ipv4/netfilter/nf_flow_table_ipv4.c
> index 25d2975da156..2447077d163d 100644
> --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
> +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
> @@ -185,7 +185,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff 
> *skb, unsigned int mtu)
>   if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
>   return false;
>  
> - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
> + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
>   return false;
>  
>   return true;
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 997c7f19ad62..a8a919520090 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ 

Re: [PATCH v2 2/4] net: sched: tbf: handle GSO_BY_FRAGS case in enqueue

2018-03-02 Thread Marcelo Ricardo Leitner
On Thu, Mar 01, 2018 at 05:13:38PM +1100, Daniel Axtens wrote:
> tbf_enqueue() checks the size of a packet before enqueuing it.
> However, the GSO size check does not consider the GSO_BY_FRAGS
> case, and so will drop GSO SCTP packets, causing a massive drop
> in throughput.
> 
> Use skb_gso_validate_mac_len() instead, as it does consider that
> case.
> 
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
>  net/sched/sch_tbf.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
> index 229172d509cc..03225a8df973 100644
> --- a/net/sched/sch_tbf.c
> +++ b/net/sched/sch_tbf.c
> @@ -188,7 +188,8 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc 
> *sch,
>   int ret;
>  
>   if (qdisc_pkt_len(skb) > q->max_size) {
> - if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
> + if (skb_is_gso(skb) &&
> + skb_gso_validate_mac_len(skb, q->max_size))
>   return tbf_segment(skb, sch, to_free);
>   return qdisc_drop(skb, sch, to_free);
>   }
> -- 
> 2.14.1
> 


Re: [Intel-wired-lan] SRIOV on Intel x710 fail to get attached both at VM creation time and VM runtime

2018-03-02 Thread Alexander Duyck
On Thu, Mar 1, 2018 at 11:21 PM, Stefan Assmann  wrote:
> On 2018-03-01 19:40, Alexander Duyck wrote:
>> On Thu, Mar 1, 2018 at 8:12 AM,   wrote:
>> > + intel-wired-...@lists.osuosl.org
>> >
>> >
>> > On 2018-03-01 21:41, p...@codeaurora.org wrote:
>> >>
>> >> Hi All,
>> >>
>> >> I am facing the following issue on kernel 4.14.14.
>> >>
>> >> Enable SRIOV on Intel x710 card.
>> >> echo 32 > /sys/class/net/eth1/device/sriov_numvfs
>> >> start net_pool
>> >> virsh net-start intel_pool
>> >>
>> >> case 1)
>> >> attach the VF while creating VM:
>> >> virt-install --accelerate --import --disk /home/disk.img --network
>> >> network=intel_pool  --boot uefi --name poza-guest --os-type linux
>> >> --os-variant rhel7 --ram 8000 --vcpus 4
>> >>
>> >> case 2)
>> >> create VM:
>> >> virt-install --accelerate --import --disk /home/disk.img --boot uefi
>> >> --name poza-guest --os-type linux --os-variant rhel7 --ram 8000
>> >> --vcpus 4
>> >> attach it:
>> >> virsh attach-interface --domain oza-guest --type network --source
>> >> intel_pool --target eth1
>> >>
>> >> kernel logs:
>> >> [44287.825287] i40evf :01:02.0: Unable to send opcode 2 to PF, err
>> >> I40E_ERR_QUEUE_EMPTY, aq_err OK
>> >> [44287.962640] i40e :01:00.0: VF 0 still in reset. Try again.
>> >> error: Failed to attach interface
>> >> error: Cannot set interface MAC/vlanid to 52:54:00:e9:f1:b5/0 for
>> >> ifname eth1 vf 0: Resource temporarily unavailable
>> >>
>> >>
>> >> The same use case works with following card with the same kernel
>> >> version and rootfs.
>> >> Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
>> >>
>> >> for details logs please have a look at
>> >> https://bugzilla.kernel.org/show_bug.cgi?id=198959
>> >>
>> >> Regards,
>> >> Oza.
>>
>> So the first question that jumps to mind is what is the firmware
>> version on the PF, ethtool -i should tell you. Are there any issues
>> bringing up the PF and getting it to pass traffic?
>
> There's a patch on Intel-wired-lan titled "i40e: Fix attach VF to VM
> issue" which should fix the problem. It's not upstream yet.
>
> As a workaround you could unload/blacklist the i40evf driver in the
> host.
>
>   Stefan

Either that or there should be an sriov_drivers_autoprobe sysfs value
for the PF that you can set to 0 and that should prevent the VF
drivers from auto-loading.

- Alex


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Samudrala, Sridhar



On 3/2/2018 12:44 PM, Siwei Liu wrote:

On Fri, Mar 2, 2018 at 12:10 PM, Michael S. Tsirkin  wrote:

On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:


On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:

On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:

The design limits things to a 1:1 relationship since we just have the
child and backup pointers, but I don't think I am seeing exception
handling to prevent us from overwriting the child pointers so there
may be a leak there.

Thanks.

- Alex

In fact maintaining a list in that case would be nicer, and
just use an arbitrary one.
E.g. one can see how a user wanting to swap device 1 for device 2
might first add device 2 with same MAC then drop device 1.

It should be possible to swap VF1 with VF2 by
1.- enabling virtio link
2.- unplugging VF1
3.- plugging VF2
4.- disabling virtio link


True, but it isn't hard to avoid breakage if user
swapped steps 2 and 3. No need to make it more
fragile than it has to be.

The migration case, VF2 is associated with another PF on another
machine (destination), I wonder how it is possible.

Even with local plugging of VF2 on the same PF, the MAC address
requirement (VF1's == VF2's) would fail the MAC address assignment on
VF2.



I didn't include updating the MAC filter step in the above sequence.
So definitely plugging 2 VFs with the same MAC address will be an issue.

Here is the more complete sequence of steps that are required to
enable live migration.

Source Hypervisor
- Bring up the virtio link
- Hot Unplug VF from the VM
- Delete FDB entry for the MAC on the Bridge associated with virtio/tap
- Remove the MAC filter associated with the VF
- Migrate VM to destination

Destination Hypervisor (after migration is completed)
- Set MAC filter for the VF
- Hot Plug VF to the VM
- Bring down the virtio link

Thanks
Sridhar





Re: [Intel-wired-lan] [net-next PATCH] ixgbevf: fix unused variable warning

2018-03-02 Thread Alexander Duyck
On Wed, Feb 28, 2018 at 3:17 PM, Arnd Bergmann  wrote:
> The new ixgbevf_set_rx_buffer_len() function causes a harmless warning
> in configurations with large page size:
>
> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c: In function 
> 'ixgbevf_set_rx_buffer_len':
> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c:1758:15: error: unused 
> variable 'max_frame' [-Werror=unused-variable]
>
> This rephrases the code so that the compiler can see the use of that
> variable, making it slightly easier to read in the process.
>
> Fixes: f15c5ba5b6cd ("ixgbevf: add support for using order 1 pages to receive 
> large frames")
> Signed-off-by: Arnd Bergmann 
> ---
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
> b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index f37307131eb6..4da449e0a4ba 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -1766,12 +1766,12 @@ static void ixgbevf_set_rx_buffer_len(struct 
> ixgbevf_adapter *adapter,
>
> set_ring_build_skb_enabled(rx_ring);
>
> -#if (PAGE_SIZE < 8192)
> -   if (max_frame <= IXGBEVF_MAX_FRAME_BUILD_SKB)
> -   return;
> +   if (PAGE_SIZE < 8192) {
> +   if (max_frame <= IXGBEVF_MAX_FRAME_BUILD_SKB)
> +   return;
>
> -   set_ring_uses_large_buffer(rx_ring);
> -#endif
> +   set_ring_uses_large_buffer(rx_ring);
> +   }
>  }
>
>  /**

The fix looks good to me.

Acked-by: Alexander Duyck 


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Fri, Mar 2, 2018 at 11:42 AM, Michael S. Tsirkin  wrote:
> On Fri, Mar 02, 2018 at 05:20:17PM +0100, Jiri Pirko wrote:
>> >Yeah, this code essentially calls out the "shareable" code with a
>> >comment at the start and end of the section what defines the
>> >virtio_bypass functionality. It would just be a matter of mostly
>> >cutting and pasting to put it into a separate driver module.
>>
>> Please put it there and unite the use of it with netvsc.
>
> Surely, adding this to other drivers (e.g. might this be handy for xen
> too?) can be left for a separate patchset. Let's get one device merged
> first.
Agreed.

-Siwei

>
> --
> MST
>
> -
> To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org
>


Re: [virtio-dev] Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Siwei Liu
On Fri, Mar 2, 2018 at 12:10 PM, Michael S. Tsirkin  wrote:
> On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:
>>
>>
>> On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:
>> > On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:
>> > > The design limits things to a 1:1 relationship since we just have the
>> > > child and backup pointers, but I don't think I am seeing exception
>> > > handling to prevent us from overwriting the child pointers so there
>> > > may be a leak there.
>> > >
>> > > Thanks.
>> > >
>> > > - Alex
>> > In fact maintaining a list in that case would be nicer, and
>> > just use an arbitrary one.
>> > E.g. one can see how a user wanting to swap device 1 for device 2
>> > might first add device 2 with same MAC then drop device 1.
>>
>> It should be possible to swap VF1 with VF2 by
>> 1.- enabling virtio link
>> 2.- unplugging VF1
>> 3.- plugging VF2
>> 4.- disabling virtio link
>>
>
> True, but it isn't hard to avoid breakage if user
> swapped steps 2 and 3. No need to make it more
> fragile than it has to be.

The migration case, VF2 is associated with another PF on another
machine (destination), I wonder how it is possible.

Even with local plugging of VF2 on the same PF, the MAC address
requirement (VF1's == VF2's) would fail the MAC address assignment on
VF2.

-Siwei

>
> --
> MST
>
> -
> To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org
>


Re: Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Alexei Starovoitov
On Fri, Mar 02, 2018 at 12:09:57PM -0800, Gianluca Borello wrote:
> Hello,
> 
> While testing bpf-next, I noticed that I was reading garbage when
> accessing some task_struct members, and the issue seems caused by the
> recent commit 2bc2f688fdf8 ("Makefile: move stack-protector
> availability out of Kconfig") which removes CONFIG_CC_STACKPROTECTOR
> from autoconf.h.
> 
> When I compile my BPF program, offsetof(struct task_struct, files),
> which is the member I'm dereferencing, returns 1768 (where the garbage
> is), whereas doing it on 4.15 returns 1776 (where the correct member
> is). I believe when compiling with clang this portion of the
> task_struct doesn't get considered anymore:
> 
> #ifdef CONFIG_CC_STACKPROTECTOR
> /* Canary value for the -fstack-protector GCC feature: */
> unsigned long stack_canary;
> #endif
> 
> I solved it by adding $(KBUILD_CPPFLAGS) to my BPF Makefile (which is
> pretty similar to the one used in samples/bpf/Makefile).
> 
> Two questions:
> 
> 1) Do you confirm this is the proper way to handle this moving
> forward? Or should there be a better way?
> 
> 2) Would you consider useful a simple patch to samples/bpf/Makefile so
> that other developers will not be stuck in a long bisect to figure out
> why they read garbage when dereferencing task_struct? I assume that
> several people use that Makefile as a template to start their project,
> like I did (perhaps I'm assuming wrong though).

good catch!
I wonder why sched.h is using this flag instead of relying on #defines from 
autoconf.h
It could have been using CONFIG_HAVE_CC_STACKPROTECTOR
instead of CONFIG_CC_STACKPROTECTOR, no ?



[PATCH 00/14] Netfilter/IPVS fixes for net

2018-03-02 Thread Pablo Neira Ayuso
Hi David,

The following patchset contains Netfilter fixes for your net tree,
they are:

1) Put back reference on CLUSTERIP configuration structure from the
   error path, patch from Florian Westphal.

2) Put reference on CLUSTERIP configuration instead of freeing it,
   another cpu may still be walking over it, also from Florian.

3) Refetch pointer to IPv6 header from nf_nat_ipv6_manip_pkt() given
   packet manipulation may reallocate the skbuff header, from Florian.

4) Missing match size sanity checks in ebt_among, from Florian.

5) Convert BUG_ON to WARN_ON in ebtables, from Florian.

6) Sanity check userspace offsets from ebtables kernel, from Florian.

7) Missing checksum replace call in flowtable IPv4 DNAT, from Felix
   Fietkau.

8) Bump the right stats on checksum error from bridge netfilter,
   from Taehee Yoo.

9) Unset interface flag in IPv6 fib lookups otherwise we get
   misleading routing lookup results, from Florian.

10) Missing sk_to_full_sk() in ip6_route_me_harder() from Eric Dumazet.

11) Don't allow devices to be part of multiple flowtables at the same
time, this may break setups.

12) Missing netlink attribute validation in flowtable deletion.

13) Wrong array index in nf_unregister_net_hook() call from error path
in flowtable addition path.

14) Fix FTP IPVS helper when NAT mangling is in place, patch from
Julian Anastasov.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 9cb9c07d6b0c5fd97d83b8ab14d7e308ba4b612f:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-02-23 
15:14:17 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 8a949fff0302b50063f74bb345a66190015528d0:

  ipvs: remove IPS_NAT_MASK check to fix passive FTP (2018-02-28 19:48:26 +0100)


Eric Dumazet (1):
  netfilter: use skb_to_full_sk in ip6_route_me_harder

Felix Fietkau (1):
  netfilter: nf_flow_table: fix checksum when handling DNAT

Florian Westphal (7):
  netfilter: ipt_CLUSTERIP: put config struct if we can't increment ct 
refcount
  netfilter: ipt_CLUSTERIP: put config instead of freeing it
  netfilter: ipv6: fix use-after-free Write in nf_nat_ipv6_manip_pkt
  netfilter: bridge: ebt_among: add missing match size checks
  netfilter: ebtables: convert BUG_ONs to WARN_ONs
  netfilter: ebtables: CONFIG_COMPAT: don't trust userland offsets
  netfilter: don't set F_IFACE on ipv6 fib lookups

Julian Anastasov (1):
  ipvs: remove IPS_NAT_MASK check to fix passive FTP

Pablo Neira Ayuso (3):
  netfilter: nf_tables: return EBUSY if device already belongs to flowtable
  netfilter: nf_tables: missing attribute validation in 
nf_tables_delflowtable()
  netfilter: nf_tables: use the right index from flowtable error path

Taehee Yoo (1):
  netfilter: increase IPSTATS_MIB_CSUMERRORS stat

 net/bridge/br_netfilter_hooks.c  |  4 +++-
 net/bridge/netfilter/ebt_among.c | 21 +++--
 net/bridge/netfilter/ebtables.c  | 40 
 net/ipv4/netfilter/ipt_CLUSTERIP.c   | 15 
 net/ipv4/netfilter/nf_flow_table_ipv4.c  |  1 +
 net/ipv6/netfilter.c |  9 +++
 net/ipv6/netfilter/ip6t_rpfilter.c   |  4 
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c |  4 
 net/ipv6/netfilter/nft_fib_ipv6.c| 12 ++
 net/netfilter/ipvs/ip_vs_ftp.c   |  2 +-
 net/netfilter/nf_tables_api.c| 25 ++--
 11 files changed, 98 insertions(+), 39 deletions(-)


[PATCH 02/14] netfilter: ipt_CLUSTERIP: put config instead of freeing it

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

Once struct is added to per-netns list it becomes visible to other cpus,
so we cannot use kfree().

Also delay setting entries refcount to 1 until after everything is
initialised so that when we call clusterip_config_put() in this spot
entries is still zero.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4c8cfd352687..8a8ae61cea71 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -232,7 +232,6 @@ clusterip_config_init(struct net *net, const struct 
ipt_clusterip_tgt_info *i,
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
refcount_set(>refcount, 1);
-   refcount_set(>entries, 1);
 
spin_lock_bh(>lock);
if (__clusterip_config_find(net, ip)) {
@@ -263,8 +262,10 @@ clusterip_config_init(struct net *net, const struct 
ipt_clusterip_tgt_info *i,
 
c->notifier.notifier_call = clusterip_netdev_event;
err = register_netdevice_notifier(>notifier);
-   if (!err)
+   if (!err) {
+   refcount_set(>entries, 1);
return c;
+   }
 
 #ifdef CONFIG_PROC_FS
proc_remove(c->pde);
@@ -273,7 +274,7 @@ clusterip_config_init(struct net *net, const struct 
ipt_clusterip_tgt_info *i,
spin_lock_bh(>lock);
list_del_rcu(>list);
spin_unlock_bh(>lock);
-   kfree(c);
+   clusterip_config_put(c);
 
return ERR_PTR(err);
 }
-- 
2.11.0



[PATCH 03/14] netfilter: ipv6: fix use-after-free Write in nf_nat_ipv6_manip_pkt

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

l4proto->manip_pkt() can cause reallocation of skb head so pointer
to the ipv6 header must be reloaded.

Reported-and-tested-by: 
Fixes: 58a317f1061c89 ("netfilter: ipv6: add IPv6 NAT support")
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c 
b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index bed57ee65f7b..6b7f075f811f 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -99,6 +99,10 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
!l4proto->manip_pkt(skb, _nat_l3proto_ipv6, iphdroff, hdroff,
target, maniptype))
return false;
+
+   /* must reload, offset might have changed */
+   ipv6h = (void *)skb->data + iphdroff;
+
 manip_addr:
if (maniptype == NF_NAT_MANIP_SRC)
ipv6h->saddr = target->src.u3.in6;
-- 
2.11.0



[PATCH 06/14] netfilter: ebtables: CONFIG_COMPAT: don't trust userland offsets

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

We need to make sure the offsets are not out of range of the
total size.
Also check that they are in ascending order.

The WARN_ON triggered by syzkaller (it sets panic_on_warn) is
changed to also bail out, no point in continuing parsing.

Briefly tested with simple ruleset of
-A INPUT --limit 1/s' --log
plus jump to custom chains using 32bit ebtables binary.

Reported-by: 
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/bridge/netfilter/ebtables.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 61f87879e389..254ef9f49567 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -2060,7 +2060,9 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt 
*match32,
if (match_kern)
match_kern->match_size = ret;
 
-   WARN_ON(type == EBT_COMPAT_TARGET && size_left);
+   if (WARN_ON(type == EBT_COMPAT_TARGET && size_left))
+   return -EINVAL;
+
match32 = (struct compat_ebt_entry_mwt *) buf;
}
 
@@ -2116,6 +2118,15 @@ static int size_entry_mwt(struct ebt_entry *entry, const 
unsigned char *base,
 *
 * offsets are relative to beginning of struct ebt_entry (i.e., 0).
 */
+   for (i = 0; i < 4 ; ++i) {
+   if (offsets[i] >= *total)
+   return -EINVAL;
+   if (i == 0)
+   continue;
+   if (offsets[i-1] > offsets[i])
+   return -EINVAL;
+   }
+
for (i = 0, j = 1 ; j < 4 ; j++, i++) {
struct compat_ebt_entry_mwt *match32;
unsigned int size;
-- 
2.11.0



[PATCH 04/14] netfilter: bridge: ebt_among: add missing match size checks

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

ebt_among is special, it has a dynamic match size and is exempt
from the central size checks.

Therefore it must check that the size of the match structure
provided from userspace is sane by making sure em->match_size
is at least the minimum size of the expected structure.

The module has such a check, but it's only done after accessing
a structure that might be out of bounds.

tested with: ebtables -A INPUT ... \
--among-dst fe:fe:fe:fe:fe:fe
--among-dst fe:fe:fe:fe:fe:fe --among-src 
fe:fe:fe:fe:ff:f,fe:fe:fe:fe:fe:fb,fe:fe:fe:fe:fc:fd,fe:fe:fe:fe:fe:fd,fe:fe:fe:fe:fe:fe
--among-src 
fe:fe:fe:fe:ff:f,fe:fe:fe:fe:fe:fa,fe:fe:fe:fe:fe:fd,fe:fe:fe:fe:fe:fe,fe:fe:fe:fe:fe:fe

Reported-by: 
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/bridge/netfilter/ebt_among.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index ce7152a12bd8..c5afb4232ecb 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -172,18 +172,35 @@ ebt_among_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
return true;
 }
 
+static bool poolsize_invalid(const struct ebt_mac_wormhash *w)
+{
+   return w && w->poolsize >= (INT_MAX / sizeof(struct 
ebt_mac_wormhash_tuple));
+}
+
 static int ebt_among_mt_check(const struct xt_mtchk_param *par)
 {
const struct ebt_among_info *info = par->matchinfo;
const struct ebt_entry_match *em =
container_of(par->matchinfo, const struct ebt_entry_match, 
data);
-   int expected_length = sizeof(struct ebt_among_info);
+   unsigned int expected_length = sizeof(struct ebt_among_info);
const struct ebt_mac_wormhash *wh_dst, *wh_src;
int err;
 
+   if (expected_length > em->match_size)
+   return -EINVAL;
+
wh_dst = ebt_among_wh_dst(info);
-   wh_src = ebt_among_wh_src(info);
+   if (poolsize_invalid(wh_dst))
+   return -EINVAL;
+
expected_length += ebt_mac_wormhash_size(wh_dst);
+   if (expected_length > em->match_size)
+   return -EINVAL;
+
+   wh_src = ebt_among_wh_src(info);
+   if (poolsize_invalid(wh_src))
+   return -EINVAL;
+
expected_length += ebt_mac_wormhash_size(wh_src);
 
if (em->match_size != EBT_ALIGN(expected_length)) {
-- 
2.11.0



[PATCH 07/14] netfilter: nf_flow_table: fix checksum when handling DNAT

2018-03-02 Thread Pablo Neira Ayuso
From: Felix Fietkau 

Add a missing call to csum_replace4 like on SNAT.

Signed-off-by: Felix Fietkau 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c 
b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 25d2975da156..282b9cc4fe82 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -111,6 +111,7 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, 
struct sk_buff *skb,
default:
return -1;
}
+   csum_replace4(>check, addr, new_addr);
 
return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
 }
-- 
2.11.0



[PATCH 01/14] netfilter: ipt_CLUSTERIP: put config struct if we can't increment ct refcount

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

This needs to put() the entry to avoid a resource leak in error path.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4b02ab39ebc5..4c8cfd352687 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -496,12 +496,15 @@ static int clusterip_tg_check(const struct xt_tgchk_param 
*par)
return PTR_ERR(config);
}
}
-   cipinfo->config = config;
 
ret = nf_ct_netns_get(par->net, par->family);
-   if (ret < 0)
+   if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
+   clusterip_config_entry_put(par->net, config);
+   clusterip_config_put(config);
+   return ret;
+   }
 
if (!par->net->xt.clusterip_deprecated_warning) {
pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
@@ -509,6 +512,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param 
*par)
par->net->xt.clusterip_deprecated_warning = true;
}
 
+   cipinfo->config = config;
return ret;
 }
 
-- 
2.11.0



[PATCH 08/14] netfilter: increase IPSTATS_MIB_CSUMERRORS stat

2018-03-02 Thread Pablo Neira Ayuso
From: Taehee Yoo 

In ip_rcv(), IPSTATS_MIB_CSUMERRORS is increased when a
checksum error occurs.
The bridge netfilter routine should increase IPSTATS_MIB_CSUMERRORS too.

Signed-off-by: Taehee Yoo 
Signed-off-by: Pablo Neira Ayuso 
---
 net/bridge/br_netfilter_hooks.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 27f1d4f2114a..9b16eaf33819 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff 
*skb)
 
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
-   goto inhdr_error;
+   goto csum_error;
 
len = ntohs(iph->tot_len);
if (skb->len < len) {
@@ -236,6 +236,8 @@ static int br_validate_ipv4(struct net *net, struct sk_buff 
*skb)
 */
return 0;
 
+csum_error:
+   __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
 inhdr_error:
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
 drop:
-- 
2.11.0



[PATCH 05/14] netfilter: ebtables: convert BUG_ONs to WARN_ONs

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

All of these conditions are not fatal and should have
been WARN_ONs from the get-go.

Convert them to WARN_ONs and bail out.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/bridge/netfilter/ebtables.c | 27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 02c4b409d317..61f87879e389 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1641,7 +1641,8 @@ static int compat_match_to_user(struct ebt_entry_match 
*m, void __user **dstptr,
int off = ebt_compat_match_offset(match, m->match_size);
compat_uint_t msize = m->match_size - off;
 
-   BUG_ON(off >= m->match_size);
+   if (WARN_ON(off >= m->match_size))
+   return -EINVAL;
 
if (copy_to_user(cm->u.name, match->name,
strlen(match->name) + 1) || put_user(msize, >match_size))
@@ -1671,7 +1672,8 @@ static int compat_target_to_user(struct ebt_entry_target 
*t,
int off = xt_compat_target_offset(target);
compat_uint_t tsize = t->target_size - off;
 
-   BUG_ON(off >= t->target_size);
+   if (WARN_ON(off >= t->target_size))
+   return -EINVAL;
 
if (copy_to_user(cm->u.name, target->name,
strlen(target->name) + 1) || put_user(tsize, >match_size))
@@ -1902,7 +1904,8 @@ static int ebt_buf_add(struct ebt_entries_buf_state 
*state,
if (state->buf_kern_start == NULL)
goto count_only;
 
-   BUG_ON(state->buf_kern_offset + sz > state->buf_kern_len);
+   if (WARN_ON(state->buf_kern_offset + sz > state->buf_kern_len))
+   return -EINVAL;
 
memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz);
 
@@ -1915,7 +1918,8 @@ static int ebt_buf_add_pad(struct ebt_entries_buf_state 
*state, unsigned int sz)
 {
char *b = state->buf_kern_start;
 
-   BUG_ON(b && state->buf_kern_offset > state->buf_kern_len);
+   if (WARN_ON(b && state->buf_kern_offset > state->buf_kern_len))
+   return -EINVAL;
 
if (b != NULL && sz > 0)
memset(b + state->buf_kern_offset, 0, sz);
@@ -1992,8 +1996,10 @@ static int compat_mtw_from_user(struct 
compat_ebt_entry_mwt *mwt,
pad = XT_ALIGN(size_kern) - size_kern;
 
if (pad > 0 && dst) {
-   BUG_ON(state->buf_kern_len <= pad);
-   BUG_ON(state->buf_kern_offset - (match_size + off) + size_kern 
> state->buf_kern_len - pad);
+   if (WARN_ON(state->buf_kern_len <= pad))
+   return -EINVAL;
+   if (WARN_ON(state->buf_kern_offset - (match_size + off) + 
size_kern > state->buf_kern_len - pad))
+   return -EINVAL;
memset(dst + size_kern, 0, pad);
}
return off + match_size;
@@ -2043,7 +2049,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt 
*match32,
if (ret < 0)
return ret;
 
-   BUG_ON(ret < match32->match_size);
+   if (WARN_ON(ret < match32->match_size))
+   return -EINVAL;
growth += ret - match32->match_size;
growth += ebt_compat_entry_padsize();
 
@@ -2140,7 +2147,8 @@ static int size_entry_mwt(struct ebt_entry *entry, const 
unsigned char *base,
 
startoff = state->buf_user_offset - startoff;
 
-   BUG_ON(*total < startoff);
+   if (WARN_ON(*total < startoff))
+   return -EINVAL;
*total -= startoff;
return 0;
 }
@@ -2267,7 +2275,8 @@ static int compat_do_replace(struct net *net, void __user 
*user,
state.buf_kern_len = size64;
 
ret = compat_copy_entries(entries_tmp, tmp.entries_size, );
-   BUG_ON(ret < 0);/* parses same data again */
+   if (WARN_ON(ret < 0))
+   goto out_unlock;
 
vfree(entries_tmp);
tmp.entries_size = size64;
-- 
2.11.0



[PATCH 13/14] netfilter: nf_tables: use the right index from flowtable error path

2018-03-02 Thread Pablo Neira Ayuso
Use the right loop index, not the number of devices in the array that we
need to remove, the following message uncovered the problem:

[ 5437.044119] hook not found, pf 5 num 0
[ 5437.044140] WARNING: CPU: 2 PID: 24983 at net/netfilter/core.c:376 
__nf_unregister_net_hook+0x250/0x280

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2b5aa78979db..558593e6a0a3 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5136,7 +5136,7 @@ static int nf_tables_newflowtable(struct net *net, struct 
sock *nlsk,
i = flowtable->ops_len;
 err4:
for (k = i - 1; k >= 0; k--)
-   nf_unregister_net_hook(net, >ops[i]);
+   nf_unregister_net_hook(net, >ops[k]);
 
kfree(flowtable->ops);
 err3:
-- 
2.11.0



[PATCH 11/14] netfilter: nf_tables: return EBUSY if device already belongs to flowtable

2018-03-02 Thread Pablo Neira Ayuso
If the netdevice is already part of a flowtable, return EBUSY. I cannot
find a valid usecase for having two flowtables bound to the same
netdevice. We can still have two flowtable where the device set is
disjoint.

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8b9fe30de0cd..43acdeef045d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5037,9 +5037,9 @@ static int nf_tables_newflowtable(struct net *net, struct 
sock *nlsk,
 {
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nf_flowtable_type *type;
+   struct nft_flowtable *flowtable, *ft;
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
-   struct nft_flowtable *flowtable;
struct nft_table *table;
struct nft_ctx ctx;
int err, i, k;
@@ -5099,6 +5099,22 @@ static int nf_tables_newflowtable(struct net *net, 
struct sock *nlsk,
goto err3;
 
for (i = 0; i < flowtable->ops_len; i++) {
+   if (!flowtable->ops[i].dev)
+   continue;
+
+   list_for_each_entry(ft, >flowtables, list) {
+   for (k = 0; k < ft->ops_len; k++) {
+   if (!ft->ops[k].dev)
+   continue;
+
+   if (flowtable->ops[i].dev == ft->ops[k].dev &&
+   flowtable->ops[i].pf == ft->ops[k].pf) {
+   err = -EBUSY;
+   goto err4;
+   }
+   }
+   }
+
err = nf_register_net_hook(net, >ops[i]);
if (err < 0)
goto err4;
-- 
2.11.0



[PATCH 09/14] netfilter: don't set F_IFACE on ipv6 fib lookups

2018-03-02 Thread Pablo Neira Ayuso
From: Florian Westphal 

"fib" starts to behave strangely when an ipv6 default route is
added - the FIB lookup returns a route using 'oif' in this case.

This behaviour was inherited from ip6tables rpfilter so change
this as well.

Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1221
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv6/netfilter/ip6t_rpfilter.c |  4 
 net/ipv6/netfilter/nft_fib_ipv6.c  | 12 ++--
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
b/net/ipv6/netfilter/ip6t_rpfilter.c
index 94deb69bbbda..91ed25a24b79 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,10 +48,6 @@ static bool rpfilter_lookup_reverse6(struct net *net, const 
struct sk_buff *skb,
}
 
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
-   if ((flags & XT_RPFILTER_LOOSE) == 0) {
-   fl6.flowi6_oif = dev->ifindex;
-   lookup_flags |= RT6_LOOKUP_F_IFACE;
-   }
 
rt = (void *) ip6_route_lookup(net, , lookup_flags);
if (rt->dst.error)
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c 
b/net/ipv6/netfilter/nft_fib_ipv6.c
index cc5174c7254c..62fc84d7bdff 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -180,7 +180,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
}
 
*dest = 0;
- again:
rt = (void *)ip6_route_lookup(nft_net(pkt), , lookup_flags);
if (rt->dst.error)
goto put_rt_err;
@@ -189,15 +188,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
goto put_rt_err;
 
-   if (oif && oif != rt->rt6i_idev->dev) {
-   /* multipath route? Try again with F_IFACE */
-   if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) {
-   lookup_flags |= RT6_LOOKUP_F_IFACE;
-   fl6.flowi6_oif = oif->ifindex;
-   ip6_rt_put(rt);
-   goto again;
-   }
-   }
+   if (oif && oif != rt->rt6i_idev->dev)
+   goto put_rt_err;
 
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
-- 
2.11.0



[PATCH 10/14] netfilter: use skb_to_full_sk in ip6_route_me_harder

2018-03-02 Thread Pablo Neira Ayuso
From: Eric Dumazet 

For some reason, Florian forgot to apply to ip6_route_me_harder
the fix that went in commit 29e09229d9f2 ("netfilter: use
skb_to_full_sk in ip_route_me_harder")

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of 
listener") 
Signed-off-by: Eric Dumazet 
Reported-by: syzbot 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv6/netfilter.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d95ceca7ff8f..531d6957af36 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -21,18 +21,19 @@
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 {
const struct ipv6hdr *iph = ipv6_hdr(skb);
+   struct sock *sk = sk_to_full_sk(skb->sk);
unsigned int hh_len;
struct dst_entry *dst;
struct flowi6 fl6 = {
-   .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+   .flowi6_oif = sk ? sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
-   .flowi6_uid = sock_net_uid(net, skb->sk),
+   .flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
int err;
 
-   dst = ip6_route_output(net, skb->sk, );
+   dst = ip6_route_output(net, sk, );
err = dst->error;
if (err) {
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
@@ -50,7 +51,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(skb, flowi6_to_flowi(), AF_INET6) == 0) {
skb_dst_set(skb, NULL);
-   dst = xfrm_lookup(net, dst, flowi6_to_flowi(), skb->sk, 0);
+   dst = xfrm_lookup(net, dst, flowi6_to_flowi(), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_set(skb, dst);
-- 
2.11.0



[PATCH 14/14] ipvs: remove IPS_NAT_MASK check to fix passive FTP

2018-03-02 Thread Pablo Neira Ayuso
From: Julian Anastasov 

The IPS_NAT_MASK check in 4.12 replaced previous check for nfct_nat()
which was needed to fix a crash in 2.6.36-rc, see
commit 7bcbf81a2296 ("ipvs: avoid oops for passive FTP").
But as IPVS does not set the IPS_SRC_NAT and IPS_DST_NAT bits,
checking for IPS_NAT_MASK prevents the PASV response from being properly
mangled and blocks the transfer. Remove the check as it is not
needed after 3.12 commit 41d73ec053d2 ("netfilter: nf_conntrack:
make sequence number adjustments usuable without NAT") which
changes nfct_nat() with nfct_seqadj() and especially after 3.13
commit b25adce16064 ("ipvs: correct usage/allocation of seqadj
ext in ipvs").

Thanks to Li Shuang and Florian Westphal for reporting the problem!

Reported-by: Li Shuang 
Fixes: be7be6e161a2 ("netfilter: ipvs: fix incorrect conflict resolution")
Signed-off-by: Julian Anastasov 
Acked-by: Simon Horman 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/ipvs/ip_vs_ftp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 3e17d32b629d..58d5d05aec24 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -260,7 +260,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct 
ip_vs_conn *cp,
buf_len = strlen(buf);
 
ct = nf_ct_get(skb, );
-   if (ct && (ct->status & IPS_NAT_MASK)) {
+   if (ct) {
bool mangled;
 
/* If mangling fails this function will return 0
-- 
2.11.0



[PATCH 12/14] netfilter: nf_tables: missing attribute validation in nf_tables_delflowtable()

2018-03-02 Thread Pablo Neira Ayuso
Return -EINVAL if mandatory attributes are missing.

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 43acdeef045d..2b5aa78979db 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5161,6 +5161,11 @@ static int nf_tables_delflowtable(struct net *net, 
struct sock *nlsk,
struct nft_table *table;
struct nft_ctx ctx;
 
+   if (!nla[NFTA_FLOWTABLE_TABLE] ||
+   (!nla[NFTA_FLOWTABLE_NAME] &&
+!nla[NFTA_FLOWTABLE_HANDLE]))
+   return -EINVAL;
+
table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
   family, genmask);
if (IS_ERR(table))
-- 
2.11.0



Re: [PATCH net-next] ipv6: fix access to non-linear packet in ndisc_fill_redirect_hdr_option()

2018-03-02 Thread Eric Dumazet
On Fri, 2018-03-02 at 11:53 +0100, Lorenzo Bianconi wrote:
> Fix the following slab-out-of-bounds kasan report in
> ndisc_fill_redirect_hdr_option when the incoming ipv6 packet is not
> linear and the accessed data are not in the linear data region of orig_skb
> 

> Reported-by: Jianlin Shi 
> Reviewed-by: Stefano Brivio 
> Signed-off-by: Lorenzo Bianconi 
> ---
>  net/ipv6/ndisc.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> index 0a19ce3a6f7f..afd8c15827cd 100644
> --- a/net/ipv6/ndisc.c
> +++ b/net/ipv6/ndisc.c
> @@ -1554,7 +1554,8 @@ static void ndisc_fill_redirect_hdr_option(struct 
> sk_buff *skb,
>   *(opt++) = (rd_len >> 3);
>   opt += 6;
>  
> - memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8);
> + skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt,
> +   rd_len - 8);
>  }

Wow, nice catch !

Reviewed-by: Eric Dumazet 



Re: [PATCH v3 net-next 1/2] lan743x: Add main source files for new lan743x driver

2018-03-02 Thread Andrew Lunn
> Hi Andrew,
> 

> The phy is built in for LAN7430, and external for LAN7431. But the
> same reset should work in both cases because it asserts the normal
> phy reset line.

Assuming the PHY reset is connected to the external line, not a GPIO.
So this is O.K.

> > Assuming it is built in, does the MAC get the interrupt?
> > phy_mac_interrupt() can be called if so.
> 
> I believe the MAC does support a phy interrupt. But right now it's
> working well using phy polling. Is it ok if I postpone supporting phy
> interrupts until a future patch? I am planning a series of patches, and I
> can add support for phy interrupts to the list.

Yes, adding this later is fine.

 Andrew


Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available

2018-03-02 Thread Michael S. Tsirkin
On Fri, Mar 02, 2018 at 11:52:27AM -0800, Samudrala, Sridhar wrote:
> 
> 
> On 3/2/2018 11:41 AM, Michael S. Tsirkin wrote:
> > On Fri, Mar 02, 2018 at 07:26:25AM -0800, Alexander Duyck wrote:
> > > The design limits things to a 1:1 relationship since we just have the
> > > child and backup pointers, but I don't think I am seeing exception
> > > handling to prevent us from overwriting the child pointers so there
> > > may be a leak there.
> > > 
> > > Thanks.
> > > 
> > > - Alex
> > In fact maintaining a list in that case would be nicer, and
> > just use an arbitrary one.
> > E.g. one can see how a user wanting to swap device 1 for device 2
> > might first add device 2 with same MAC then drop device 1.
> 
> It should be possible to swap VF1 with VF2 by
> 1.- enabling virtio link
> 2.- unplugging VF1
> 3.- plugging VF2
> 4.- disabling virtio link
> 

True, but it isn't hard to avoid breakage if the user
swapped steps 2 and 3. No need to make it more
fragile than it has to be.

-- 
MST


[PATCH v2 iproute2-next 3/6] rdma: Add CM_ID resource tracking information

2018-03-02 Thread Steve Wise
Sample output:

# rdma resource
2: cxgb4_0: pd 5 cq 2 qp 2 cm_id 3 mr 7
3: mlx4_0: pd 7 cq 3 qp 3 cm_id 3 mr 7

# rdma resource show cm_id
link cxgb4_0/- lqpn 0 qp-type RC state LISTEN ps TCP pid 30485 comm rping 
src-addr 0.0.0.0:7174
link cxgb4_0/2 lqpn 1048 qp-type RC state CONNECT ps TCP pid 30503 comm rping 
src-addr 172.16.2.1:7174 dst-addr 172.16.2.1:38246
link cxgb4_0/2 lqpn 1040 qp-type RC state CONNECT ps TCP pid 30498 comm rping 
src-addr 172.16.2.1:38246 dst-addr 172.16.2.1:7174
link mlx4_0/- lqpn 0 qp-type RC state LISTEN ps TCP pid 30485 comm rping 
src-addr 0.0.0.0:7174
link mlx4_0/1 lqpn 539 qp-type RC state CONNECT ps TCP pid 30494 comm rping 
src-addr 172.16.99.1:7174 dst-addr 172.16.99.1:43670
link mlx4_0/1 lqpn 538 qp-type RC state CONNECT ps TCP pid 30492 comm rping 
src-addr 172.16.99.1:43670 dst-addr 172.16.99.1:7174

# rdma resource show cm_id dst-port 7174
link cxgb4_0/2 lqpn 1040 qp-type RC state CONNECT ps TCP pid 30498 comm rping 
src-addr 172.16.2.1:38246 dst-addr 172.16.2.1:7174
link mlx4_0/1 lqpn 538 qp-type RC state CONNECT ps TCP pid 30492 comm rping 
src-addr 172.16.99.1:43670 dst-addr 172.16.99.1:7174

Signed-off-by: Steve Wise 
---
 rdma/rdma.h  |   2 +
 rdma/res.c   | 258 ++-
 rdma/utils.c |   5 ++
 3 files changed, 264 insertions(+), 1 deletion(-)

diff --git a/rdma/rdma.h b/rdma/rdma.h
index 5809f70..e55205b 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -18,10 +18,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "list.h"
 #include "utils.h"
 #include "json_writer.h"
+#include 
 
 #define pr_err(args...) fprintf(stderr, ##args)
 #define pr_out(args...) fprintf(stdout, ##args)
diff --git a/rdma/res.c b/rdma/res.c
index 62f5c54..1ef4f20 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -16,9 +16,11 @@ static int res_help(struct rd *rd)
 {
pr_out("Usage: %s resource\n", rd->filename);
pr_out("  resource show [DEV]\n");
-   pr_out("  resource show [qp]\n");
+   pr_out("  resource show [qp|cm_id]\n");
pr_out("  resource show qp link [DEV/PORT]\n");
pr_out("  resource show qp link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
+   pr_out("  resource show cm_id link [DEV/PORT]\n");
+   pr_out("  resource show cm_id link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
return 0;
 }
 
@@ -433,6 +435,230 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_qp_type(struct rd *rd, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "qp-type",
+  qp_types_to_str(val));
+   else
+   pr_out("qp-type %s ", qp_types_to_str(val));
+}
+
+static const char *cm_id_state_to_str(uint8_t idx)
+{
+   static const char * const cm_id_states_str[] = { "IDLE", "ADDR_QUERY",
+ "ADDR_RESOLVED", 
"ROUTE_QUERY", "ROUTE_RESOLVED",
+ "CONNECT", "DISCONNECT",
+ "ADDR_BOUND", "LISTEN", 
"DEVICE_REMOVAL", "DESTROYING" };
+
+   if (idx < ARRAY_SIZE(cm_id_states_str))
+   return cm_id_states_str[idx];
+   return "UNKNOWN";
+}
+
+static const char *cm_id_ps_to_str(uint32_t ps)
+{
+   switch (ps) {
+   case RDMA_PS_IPOIB:
+   return "IPoIB";
+   case RDMA_PS_IB:
+   return "IPoIB";
+   case RDMA_PS_TCP:
+   return "TCP";
+   case RDMA_PS_UDP:
+   return "UDP";
+   default:
+   return "---";
+   }
+}
+
+static void print_cm_id_state(struct rd *rd, uint8_t state)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "state", cm_id_state_to_str(state));
+   return;
+   }
+   pr_out("state %s ", cm_id_state_to_str(state));
+}
+
+static void print_ps(struct rd *rd, uint32_t ps)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "ps", cm_id_ps_to_str(ps));
+   return;
+   }
+   pr_out("ps %s ", cm_id_ps_to_str(ps));
+}
+
+static void print_ipaddr(struct rd *rd, const char *key, char *addrstr, 
uint16_t port)
+{
+   if (rd->json_output) {
+   int name_size = INET6_ADDRSTRLEN+strlen(":65535");
+   char json_name[name_size];
+
+   snprintf(json_name, name_size, "%s:%u", addrstr, port);
+   jsonw_string_field(rd->jw, key, json_name);
+   return;
+   }
+   pr_out("%s %s:%u ", key, addrstr, port);
+}
+
+static int ss_ntop(struct nlattr *nla_line, char *addr_str, uint16_t *port)
+{
+   struct __kernel_sockaddr_storage *addr;
+
+   addr = (struct __kernel_sockaddr_storage 
*)mnl_attr_get_payload(nla_line);
+   switch (addr->ss_family) {
+   

[PATCH v2 iproute2-next 6/6] rdma: Add PD resource tracking information

2018-03-02 Thread Steve Wise
Sample output:

Without CAP_NET_ADMIN capability:

link mlx4_0/- users 0 pid 0 comm [ib_srpt]
link mlx4_0/- users 0 pid 0 comm [ib_srp]
link mlx4_0/- users 1 pid 0 comm [ib_core]
link cxgb4_0/- users 0 pid 0 comm [ib_srp]

With CAP_NET_ADMIN capability:
link mlx4_0/- local_dma_lkey 0x8000 users 0 pid 0 comm [ib_srpt]
link mlx4_0/- local_dma_lkey 0x8000 users 0 pid 0 comm [ib_srp]
link mlx4_0/- local_dma_lkey 0x8000 users 1 pid 0 comm [ib_core]
link cxgb4_0/- local_dma_lkey 0x0 users 0 pid 0 comm [ib_srp]

Signed-off-by: Steve Wise 
---
 rdma/res.c | 92 ++
 1 file changed, 92 insertions(+)

diff --git a/rdma/res.c b/rdma/res.c
index 4caf1d0..435e1ad 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -892,6 +892,87 @@ static int res_mr_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static int res_pd_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_PD])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_PD];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   uint32_t local_dma_lkey = 0, unsafe_global_rkey = 0;
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   char *comm = NULL;
+   uint32_t pid = 0;
+   uint64_t users;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_USECNT] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   return MNL_CB_ERROR;
+   }
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY])
+   local_dma_lkey = 
mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]);
+
+   users = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_USECNT]);
+   if (rd_check_is_filtered(rd, "users", users))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY])
+   unsafe_global_rkey = 
mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]);
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) {
+   pid = 
mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+   comm = get_task_name(pid);
+   }
+
+   if (rd_check_is_filtered(rd, "pid", pid))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
+   /* discard const from mnl_attr_get_str */
+   comm = (char 
*)mnl_attr_get_str(nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME]);
+
+   if (rd->json_output)
+   jsonw_start_array(rd->jw);
+
+   print_link(rd, idx, name, 0, nla_line);
+   if (nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY])
+   print_key(rd, "local_dma_lkey", local_dma_lkey);
+   print_users(rd, users);
+   if (nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY])
+   print_key(rd, "unsafe_global_rkey", unsafe_global_rkey);
+   print_pid(rd, pid);
+   print_comm(rd, comm, nla_line);
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
+   free(comm);
+
+   if (rd->json_output)
+   jsonw_end_array(rd->jw);
+   else
+   pr_out("\n");
+   }
+   return MNL_CB_OK;
+}
+
 RES_FUNC(res_no_args,  RDMA_NLDEV_CMD_RES_GET, NULL, true);
 
 static const struct
@@ -971,6 +1052,16 @@ filters mr_valid_filters[MAX_NUMBER_OF_FILTERS] = {{ 
.name = "link",
 
 RES_FUNC(res_mr,   RDMA_NLDEV_CMD_RES_MR_GET, mr_valid_filters, true);
 
+static const struct
+filters pd_valid_filters[MAX_NUMBER_OF_FILTERS] = {{ .name = "link",
+  .is_number = false },
+  { .name = "users",
+  .is_number = true },
+  { .name = "pid",
+  .is_number = true }};
+
+RES_FUNC(res_pd,   RDMA_NLDEV_CMD_RES_PD_GET, pd_valid_filters, true);
+
 static int 

[PATCH v2 iproute2-next 5/6] rdma: Add MR resource tracking information

2018-03-02 Thread Steve Wise
Sample output:

Without CAP_NET_ADMIN:

$ rdma resource show mr mrlen 65536
link mlx4_0/- mrlen 65536 pid 0 comm [nvme_rdma]
link cxgb4_0/- mrlen 65536 pid 0 comm [nvme_rdma]

With CAP_NET_ADMIN:

# rdma resource show mr mrlen 65536
link mlx4_0/- rkey 0x12702 lkey 0x12702 iova 0x85724a000 mrlen 65536 pid 0 comm 
[nvme_rdma]
link cxgb4_0/- rkey 0x68fe4e9 lkey 0x68fe4e9 iova 0x835b91000 mrlen 65536 pid 0 
comm [nvme_rdma]

Signed-off-by: Steve Wise 
---
 include/json_writer.h |   2 +
 lib/json_writer.c |  11 +
 rdma/res.c| 125 ++
 rdma/utils.c  |   6 +++
 4 files changed, 144 insertions(+)

diff --git a/include/json_writer.h b/include/json_writer.h
index 1516aaf..34f2ccc 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -39,6 +39,7 @@ void jsonw_bool(json_writer_t *self, bool value);
 void jsonw_float(json_writer_t *self, double number);
 void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num);
 void jsonw_uint(json_writer_t *self, uint64_t number);
+void jsonw_xint(json_writer_t *self, uint64_t number);
 void jsonw_hu(json_writer_t *self, unsigned short number);
 void jsonw_int(json_writer_t *self, int64_t number);
 void jsonw_null(json_writer_t *self);
@@ -49,6 +50,7 @@ void jsonw_string_field(json_writer_t *self, const char 
*prop, const char *val);
 void jsonw_bool_field(json_writer_t *self, const char *prop, bool value);
 void jsonw_float_field(json_writer_t *self, const char *prop, double num);
 void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num);
+void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num);
 void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num);
 void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num);
 void jsonw_null_field(json_writer_t *self, const char *prop);
diff --git a/lib/json_writer.c b/lib/json_writer.c
index f3eeaf7..6d73a1b 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -224,6 +224,11 @@ void jsonw_uint(json_writer_t *self, uint64_t num)
jsonw_printf(self, "%"PRIu64, num);
 }
 
+void jsonw_xint(json_writer_t *self, uint64_t num)
+{
+   jsonw_printf(self, "%"PRIx64, num);
+}
+
 void jsonw_lluint(json_writer_t *self, unsigned long long int num)
 {
jsonw_printf(self, "%llu", num);
@@ -268,6 +273,12 @@ void jsonw_uint_field(json_writer_t *self, const char 
*prop, uint64_t num)
jsonw_uint(self, num);
 }
 
+void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num)
+{
+   jsonw_name(self, prop);
+   jsonw_xint(self, num);
+}
+
 void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num)
 {
jsonw_name(self, prop);
diff --git a/rdma/res.c b/rdma/res.c
index 595fbbb..4caf1d0 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -782,6 +782,116 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_key(struct rd *rd, const char *name, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_xint_field(rd->jw, name, val);
+   else
+   pr_out("%s 0x%x ", name, val);
+}
+
+static void print_iova(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_xint_field(rd->jw, "iova", val);
+   else
+   pr_out("iova 0x%" PRIx64 " ", val);
+}
+
+static void print_mrlen(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "mrlen", val);
+   else
+   pr_out("mrlen %" PRIu64 " ", val);
+}
+
+static int res_mr_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_MR])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_MR];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   uint32_t rkey = 0, lkey = 0;
+   uint64_t iova = 0, mrlen;
+   char *comm = NULL;
+   uint32_t pid = 0;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_MRLEN] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   return 

[PATCH v2 iproute2-next 1/6] rdma: update rdma_netlink.h

2018-03-02 Thread Steve Wise
From: Steve Wise 

Pull in the latest rdma_netlink.h which has support for
the rdma nldev resource tracking objects being added
with this patch series.

Signed-off-by: Steve Wise 
---
 include/uapi/rdma/rdma_netlink.h | 44 +---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index dbac3b8..351139c 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _RDMA_NETLINK_H
-#define _RDMA_NETLINK_H
+#ifndef _UAPI_RDMA_NETLINK_H
+#define _UAPI_RDMA_NETLINK_H
 
 #include 
 
@@ -238,6 +238,14 @@ enum rdma_nldev_command {
 
RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */
 
+   RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */
+
RDMA_NLDEV_NUM_OPS
 };
 
@@ -350,6 +358,36 @@ enum rdma_nldev_attr {
 */
RDMA_NLDEV_ATTR_RES_KERN_NAME,  /* string */
 
+   RDMA_NLDEV_ATTR_RES_CM_ID,  /* nested table */
+   RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,/* nested table */
+   /*
+* rdma_cm_id port space.
+*/
+   RDMA_NLDEV_ATTR_RES_PS, /* u32 */
+   /*
+* Source and destination socket addresses
+*/
+   RDMA_NLDEV_ATTR_RES_SRC_ADDR,   /* __kernel_sockaddr_storage */
+   RDMA_NLDEV_ATTR_RES_DST_ADDR,   /* __kernel_sockaddr_storage */
+
+   RDMA_NLDEV_ATTR_RES_CQ, /* nested table */
+   RDMA_NLDEV_ATTR_RES_CQ_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_CQE,/* u32 */
+   RDMA_NLDEV_ATTR_RES_USECNT, /* u64 */
+   RDMA_NLDEV_ATTR_RES_POLL_CTX,   /* u8 */
+
+   RDMA_NLDEV_ATTR_RES_MR, /* nested table */
+   RDMA_NLDEV_ATTR_RES_MR_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_RKEY,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_LKEY,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_IOVA,   /* u64 */
+   RDMA_NLDEV_ATTR_RES_MRLEN,  /* u64 */
+
+   RDMA_NLDEV_ATTR_RES_PD, /* nested table */
+   RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
+   RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
+
RDMA_NLDEV_ATTR_MAX
 };
-#endif /* _RDMA_NETLINK_H */
+#endif /* _UAPI_RDMA_NETLINK_H */
-- 
1.8.3.1



Issue accessing task_struct from BPF due to 4.16 stack-protector changes

2018-03-02 Thread Gianluca Borello
Hello,

While testing bpf-next, I noticed that I was reading garbage when
accessing some task_struct members, and the issue seems caused by the
recent commit 2bc2f688fdf8 ("Makefile: move stack-protector
availability out of Kconfig") which removes CONFIG_CC_STACKPROTECTOR
from autoconf.h.

When I compile my BPF program, offsetof(struct task_struct, files),
which is the member I'm dereferencing, returns 1768 (where the garbage
is), whereas doing it on 4.15 returns 1776 (where the correct member
is). I believe when compiling with clang this portion of the
task_struct doesn't get considered anymore:

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif

I solved it by adding $(KBUILD_CPPFLAGS) to my BPF Makefile (which is
pretty similar to the one used in samples/bpf/Makefile).

Two questions:

1) Can you confirm this is the proper way to handle this moving
forward? Or is there a better way?

2) Would you find it useful to have a simple patch to samples/bpf/Makefile so
that other developers will not be stuck in a long bisect to figure out
why they read garbage when dereferencing task_struct? I assume that
several people use that Makefile as a template to start their project,
like I did (perhaps I'm assuming wrong though).

Thanks


[PATCH v2 iproute2-next 2/6] rdma: initialize the rd struct

2018-03-02 Thread Steve Wise
Initialize the rd struct so port_idx is 0 unless set otherwise.
Otherwise, strict_port queries end up passing an uninitialized PORT
nlattr.

Signed-off-by: Steve Wise 
---
 rdma/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index 19608f4..c652550 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
bool show_details = false;
bool json_output = false;
char *filename;
-   struct rd rd;
+   struct rd rd = {};
int opt;
int err;
 
-- 
1.8.3.1



[PATCH v2 iproute2-next 4/6] rdma: Add CQ resource tracking information

2018-03-02 Thread Steve Wise
Sample output:

# rdma resource show cq
link cxgb4_0/- cqe 46 users 2 pid 30503 comm rping
link cxgb4_0/- cqe 46 users 2 pid 30498 comm rping
link mlx4_0/- cqe 63 users 2 pid 30494 comm rping
link mlx4_0/- cqe 63 users 2 pid 30489 comm rping
link mlx4_0/- cqe 1023 users 2 poll_ctx WORKQUEUE pid 0 comm [ib_core]

# rdma resource show cq pid 30489
link mlx4_0/- cqe 63 users 2 pid 30489 comm rping

Signed-off-by: Steve Wise 
---
 rdma/res.c   | 136 +++
 rdma/utils.c |   5 +++
 2 files changed, 141 insertions(+)

diff --git a/rdma/res.c b/rdma/res.c
index 1ef4f20..595fbbb 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -21,6 +21,8 @@ static int res_help(struct rd *rd)
pr_out("  resource show qp link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
pr_out("  resource show cm_id link [DEV/PORT]\n");
pr_out("  resource show cm_id link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
+   pr_out("  resource show cq link [DEV/PORT]\n");
+   pr_out("  resource show cq link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
return 0;
 }
 
@@ -659,6 +661,127 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_cqe(struct rd *rd, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "cqe", val);
+   else
+   pr_out("cqe %u ", val);
+}
+
+static void print_users(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "users", val);
+   else
+   pr_out("users %" PRIu64 " ", val);
+}
+
+static const char *poll_ctx_to_str(uint8_t idx)
+{
+   static const char * const cm_id_states_str[] = { "DIRECT", "SOFTIRQ",
+ "WORKQUEUE"};
+
+   if (idx < ARRAY_SIZE(cm_id_states_str))
+   return cm_id_states_str[idx];
+   return "UNKNOWN";
+}
+
+static void print_poll_ctx(struct rd *rd, uint8_t poll_ctx)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "poll-ctx", 
poll_ctx_to_str(poll_ctx));
+   return;
+   }
+   pr_out("poll-ctx %s ", poll_ctx_to_str(poll_ctx));
+}
+
+static int res_cq_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_CQ])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_CQ];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   char *comm = NULL;
+   uint32_t pid = 0;
+   uint8_t poll_ctx = 0;
+   uint64_t users;
+   uint32_t cqe;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_CQE] ||
+   !nla_line[RDMA_NLDEV_ATTR_RES_USECNT] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   return MNL_CB_ERROR;
+   }
+
+   cqe = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_CQE]);
+
+   users = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_USECNT]);
+   if (rd_check_is_filtered(rd, "users", users))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_POLL_CTX]) {
+   poll_ctx = 
mnl_attr_get_u8(nla_line[RDMA_NLDEV_ATTR_RES_POLL_CTX]);
+   if (rd_check_is_string_filtered(rd, "poll-ctx", 
poll_ctx_to_str(poll_ctx)))
+   continue;
+   }
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) {
+   pid = 
mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+   comm = get_task_name(pid);
+   }
+
+   if (rd_check_is_filtered(rd, "pid", pid)) {
+   free(comm);
+   continue;
+   }
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
+   /* discard const from mnl_attr_get_str */
+   comm = (char 
*)mnl_attr_get_str(nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME]);
+
+   if (rd->json_output)
+   

  1   2   3   >